Diffstat (limited to 'net/ipv4')
 net/ipv4/Kconfig                                      |   6
 net/ipv4/af_inet.c                                    | 126
 net/ipv4/ah4.c                                        | 300
 net/ipv4/arp.c                                        |  65
 net/ipv4/cipso_ipv4.c                                 |   3
 net/ipv4/datagram.c                                   |  18
 net/ipv4/devinet.c                                    | 306
 net/ipv4/esp4.c                                       |   4
 net/ipv4/fib_frontend.c                               |  57
 net/ipv4/fib_hash.c                                   |  26
 net/ipv4/fib_rules.c                                  |  16
 net/ipv4/fib_semantics.c                              |  85
 net/ipv4/fib_trie.c                                   |  35
 net/ipv4/icmp.c                                       |  16
 net/ipv4/igmp.c                                       | 140
 net/ipv4/inet_connection_sock.c                       |  29
 net/ipv4/inet_diag.c                                  |  31
 net/ipv4/inet_fragment.c                              |   1
 net/ipv4/inet_hashtables.c                            |  73
 net/ipv4/inet_lro.c                                   |  36
 net/ipv4/inet_timewait_sock.c                         | 154
 net/ipv4/inetpeer.c                                   |   5
 net/ipv4/ip_forward.c                                 |   1
 net/ipv4/ip_fragment.c                                |  53
 net/ipv4/ip_gre.c                                     | 119
 net/ipv4/ip_input.c                                   |   5
 net/ipv4/ip_options.c                                 |   1
 net/ipv4/ip_output.c                                  |  28
 net/ipv4/ip_sockglue.c                                |  27
 net/ipv4/ipcomp.c                                     |  17
 net/ipv4/ipconfig.c                                   |  73
 net/ipv4/ipip.c                                       | 116
 net/ipv4/ipmr.c                                       |  50
 net/ipv4/netfilter.c                                  |  15
 net/ipv4/netfilter/arp_tables.c                       | 412
 net/ipv4/netfilter/arptable_filter.c                  |  96
 net/ipv4/netfilter/ip_queue.c                         |  11
 net/ipv4/netfilter/ip_tables.c                        | 607
 net/ipv4/netfilter/ipt_CLUSTERIP.c                    |  35
 net/ipv4/netfilter/ipt_ECN.c                          |  10
 net/ipv4/netfilter/ipt_LOG.c                          |  22
 net/ipv4/netfilter/ipt_MASQUERADE.c                   |   4
 net/ipv4/netfilter/ipt_REJECT.c                       |   5
 net/ipv4/netfilter/ipt_ULOG.c                         |  11
 net/ipv4/netfilter/ipt_ecn.c                          |   4
 net/ipv4/netfilter/iptable_filter.c                   | 125
 net/ipv4/netfilter/iptable_mangle.c                   | 171
 net/ipv4/netfilter/iptable_raw.c                      |  97
 net/ipv4/netfilter/iptable_security.c                 | 118
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c        |  23
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c |   4
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c          |  47
 net/ipv4/netfilter/nf_defrag_ipv4.c                   |  34
 net/ipv4/netfilter/nf_nat_core.c                      |  41
 net/ipv4/netfilter/nf_nat_ftp.c                       | 105
 net/ipv4/netfilter/nf_nat_helper.c                    |  62
 net/ipv4/netfilter/nf_nat_pptp.c                      |   3
 net/ipv4/netfilter/nf_nat_rule.c                      |  42
 net/ipv4/netfilter/nf_nat_sip.c                       | 154
 net/ipv4/netfilter/nf_nat_snmp_basic.c                |  32
 net/ipv4/netfilter/nf_nat_standalone.c                |  11
 net/ipv4/proc.c                                       |  34
 net/ipv4/raw.c                                        |  34
 net/ipv4/route.c                                      | 305
 net/ipv4/syncookies.c                                 |  11
 net/ipv4/sysctl_net_ipv4.c                            | 190
 net/ipv4/tcp.c                                        | 450
 net/ipv4/tcp_cong.c                                   |   1
 net/ipv4/tcp_diag.c                                   |   2
 net/ipv4/tcp_htcp.c                                   |  10
 net/ipv4/tcp_input.c                                  | 142
 net/ipv4/tcp_ipv4.c                                   | 232
 net/ipv4/tcp_lp.c                                     |   4
 net/ipv4/tcp_minisocks.c                              |  73
 net/ipv4/tcp_output.c                                 | 336
 net/ipv4/tcp_probe.c                                  |  33
 net/ipv4/tcp_timer.c                                  |  69
 net/ipv4/tcp_veno.c                                   |   5
 net/ipv4/tcp_yeah.c                                   |   4
 net/ipv4/tunnel4.c                                    |   1
 net/ipv4/udp.c                                        | 515
 net/ipv4/udplite.c                                    |   9
 net/ipv4/xfrm4_input.c                                |   1
 net/ipv4/xfrm4_mode_tunnel.c                          |   1
 net/ipv4/xfrm4_policy.c                               |  20
85 files changed, 3923 insertions, 2882 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 70491d9035eb..0c94a1ac2946 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,7 +166,7 @@ config IP_PNP_DHCP
 
 	  If unsure, say Y. Note that if you want to use DHCP, a DHCP server
 	  must be operating on your network. Read
-	  <file:Documentation/filesystems/nfsroot.txt> for details.
+	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
 
 config IP_PNP_BOOTP
 	bool "IP: BOOTP support"
@@ -181,7 +181,7 @@ config IP_PNP_BOOTP
 	  does BOOTP itself, providing all necessary information on the kernel
 	  command line, you can say N here. If unsure, say Y. Note that if you
 	  want to use BOOTP, a BOOTP server must be operating on your network.
-	  Read <file:Documentation/filesystems/nfsroot.txt> for details.
+	  Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
 
 config IP_PNP_RARP
 	bool "IP: RARP support"
@@ -194,7 +194,7 @@ config IP_PNP_RARP
 	  older protocol which is being obsoleted by BOOTP and DHCP), say Y
 	  here. Note that if you want to use RARP, a RARP server must be
 	  operating on your network. Read
-	  <file:Documentation/filesystems/nfsroot.txt> for details.
+	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
 
 # not yet ready..
 # bool '    IP: ARP support' CONFIG_IP_PNP_ARP
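
A quick usage note (not part of the diff): the three options above only matter when the kernel autoconfigures its own IP address at boot, typically for an NFS root. Assuming the standard ip= boot parameter described in the nfsroot documentation referenced above, the corresponding boot-line choices would be:

	ip=dhcp      # needs CONFIG_IP_PNP_DHCP
	ip=bootp     # needs CONFIG_IP_PNP_BOOTP
	ip=rarp      # needs CONFIG_IP_PNP_RARP
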
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 57737b8d1711..f71357422380 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -86,6 +86,7 @@
 #include <linux/poll.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -174,12 +175,12 @@ static int inet_autobind(struct sock *sk)
 	/* We may need to bind the socket. */
 	lock_sock(sk);
 	inet = inet_sk(sk);
-	if (!inet->num) {
+	if (!inet->inet_num) {
 		if (sk->sk_prot->get_port(sk, 0)) {
 			release_sock(sk);
 			return -EAGAIN;
 		}
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 	}
 	release_sock(sk);
 	return 0;
@@ -262,7 +263,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
  *	Create an inet socket.
  */
 
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	struct inet_protosw *answer;
@@ -325,7 +327,7 @@ lookup_protocol:
 	}
 
 	err = -EPERM;
-	if (answer->capability > 0 && !capable(answer->capability))
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
 
 	err = -EAFNOSUPPORT;
@@ -354,7 +356,7 @@ lookup_protocol:
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
 	if (SOCK_RAW == sock->type) {
-		inet->num = protocol;
+		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
 			inet->hdrincl = 1;
 	}
@@ -364,7 +366,7 @@ lookup_protocol:
 	else
 		inet->pmtudisc = IP_PMTUDISC_WANT;
 
-	inet->id = 0;
+	inet->inet_id = 0;
 
 	sock_init_data(sock, sk);
 
@@ -381,13 +383,13 @@ lookup_protocol:
 
 	sk_refcnt_debug_inc(sk);
 
-	if (inet->num) {
+	if (inet->inet_num) {
 		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
 		 * creation time automatically
 		 * shares.
 		 */
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 		/* Add to protocol hash chains. */
 		sk->sk_prot->hash(sk);
 	}
@@ -494,27 +496,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	/* Check these errors (active socket, double bind). */
 	err = -EINVAL;
-	if (sk->sk_state != TCP_CLOSE || inet->num)
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
 		goto out_release_sock;
 
-	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-		inet->saddr = 0;  /* Use device */
+		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
 	if (sk->sk_prot->get_port(sk, snum)) {
-		inet->saddr = inet->rcv_saddr = 0;
+		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
 		goto out_release_sock;
 	}
 
-	if (inet->rcv_saddr)
+	if (inet->inet_rcv_saddr)
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-	inet->sport = htons(inet->num);
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk_dst_reset(sk);
 	err = 0;
 out_release_sock:
@@ -529,10 +531,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 {
 	struct sock *sk = sock->sk;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 }
@@ -572,6 +576,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
 	lock_sock(sk);
 
 	if (uaddr->sa_family == AF_UNSPEC) {
@@ -685,21 +692,21 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 {
 	struct sock *sk = sock->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
 
 	sin->sin_family = AF_INET;
 	if (peer) {
-		if (!inet->dport ||
+		if (!inet->inet_dport ||
 		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
 		     peer == 1))
 			return -ENOTCONN;
-		sin->sin_port = inet->dport;
-		sin->sin_addr.s_addr = inet->daddr;
+		sin->sin_port = inet->inet_dport;
+		sin->sin_addr.s_addr = inet->inet_daddr;
 	} else {
-		__be32 addr = inet->rcv_saddr;
+		__be32 addr = inet->inet_rcv_saddr;
 		if (!addr)
-			addr = inet->saddr;
-		sin->sin_port = inet->sport;
+			addr = inet->inet_saddr;
+		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
 	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -714,7 +721,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
@@ -728,7 +735,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
@@ -931,7 +938,7 @@ static const struct proto_ops inet_sockraw_ops = {
 #endif
 };
 
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
 	.family = PF_INET,
 	.create = inet_create,
 	.owner	= THIS_MODULE,
@@ -947,7 +954,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_TCP,
 		.prot =       &tcp_prot,
 		.ops =        &inet_stream_ops,
-		.capability = -1,
 		.no_check =   0,
 		.flags =      INET_PROTOSW_PERMANENT |
 			      INET_PROTOSW_ICSK,
@@ -958,7 +964,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_UDP,
 		.prot =       &udp_prot,
 		.ops =        &inet_dgram_ops,
-		.capability = -1,
 		.no_check =   UDP_CSUM_DEFAULT,
 		.flags =      INET_PROTOSW_PERMANENT,
        },
@@ -969,7 +974,6 @@ static struct inet_protosw inetsw_array[] =
 	       .protocol =   IPPROTO_IP,	/* wild card */
 	       .prot =       &raw_prot,
 	       .ops =        &inet_sockraw_ops,
-	       .capability = CAP_NET_RAW,
 	       .no_check =   UDP_CSUM_DEFAULT,
 	       .flags =      INET_PROTOSW_REUSE,
        }
@@ -1059,9 +1063,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	int err;
 	struct rtable *rt;
-	__be32 old_saddr = inet->saddr;
+	__be32 old_saddr = inet->inet_saddr;
 	__be32 new_saddr;
-	__be32 daddr = inet->daddr;
+	__be32 daddr = inet->inet_daddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
@@ -1071,7 +1075,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 			       RT_CONN_FLAGS(sk),
 			       sk->sk_bound_dev_if,
 			       sk->sk_protocol,
-			       inet->sport, inet->dport, sk, 0);
+			       inet->inet_sport, inet->inet_dport, sk, 0);
 	if (err)
 		return err;
 
@@ -1087,7 +1091,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 			       __func__, &old_saddr, &new_saddr);
 	}
 
-	inet->saddr = inet->rcv_saddr = new_saddr;
+	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
 
 	/*
 	 * XXX The only one ugly spot where we need to
@@ -1113,7 +1117,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
-	daddr = inet->daddr;
+	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 {
@@ -1123,7 +1127,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.nl_u = {
 			.ip4_u = {
 				.daddr = daddr,
-				.saddr = inet->saddr,
+				.saddr = inet->inet_saddr,
 				.tos = RT_CONN_FLAGS(sk),
 			},
 		},
@@ -1131,8 +1135,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.flags = inet_sk_flowi_flags(sk),
 		.uli_u = {
 			.ports = {
-				.sport = inet->sport,
-				.dport = inet->dport,
+				.sport = inet->inet_sport,
+				.dport = inet->inet_dport,
 			},
 		},
 	};
@@ -1387,7 +1391,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 {
 	unsigned long res = 0;
 	int i;
@@ -1400,7 +1404,7 @@ unsigned long snmp_fold_field(void *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
-int snmp_mib_init(void *ptr[2], size_t mibsize)
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
 {
 	BUG_ON(ptr == NULL);
 	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
@@ -1418,7 +1422,7 @@ err0:
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void *ptr[2])
+void snmp_mib_free(void __percpu *ptr[2])
 {
 	BUG_ON(ptr == NULL);
 	free_percpu(ptr[0]);
@@ -1462,25 +1466,25 @@ static const struct net_protocol icmp_protocol = {
 
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
-	if (snmp_mib_init((void **)net->mib.tcp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
 			  sizeof(struct tcp_mib)) < 0)
 		goto err_tcp_mib;
-	if (snmp_mib_init((void **)net->mib.ip_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
 			  sizeof(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp_mib_init((void **)net->mib.net_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
 			  sizeof(struct linux_mib)) < 0)
 		goto err_net_mib;
-	if (snmp_mib_init((void **)net->mib.udp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
 			  sizeof(struct udp_mib)) < 0)
 		goto err_udp_mib;
-	if (snmp_mib_init((void **)net->mib.udplite_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
 			  sizeof(struct udp_mib)) < 0)
 		goto err_udplite_mib;
-	if (snmp_mib_init((void **)net->mib.icmp_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
 			  sizeof(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void **)net->mib.icmpmsg_statistics,
+	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
 			  sizeof(struct icmpmsg_mib)) < 0)
 		goto err_icmpmsg_mib;
 
@@ -1488,30 +1492,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
 	return 0;
 
 err_icmpmsg_mib:
-	snmp_mib_free((void **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
 err_icmp_mib:
-	snmp_mib_free((void **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
 err_udplite_mib:
-	snmp_mib_free((void **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
 err_udp_mib:
-	snmp_mib_free((void **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
 err_net_mib:
-	snmp_mib_free((void **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
 err_ip_mib:
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 err_tcp_mib:
 	return -ENOMEM;
 }
 
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
-	snmp_mib_free((void **)net->mib.icmpmsg_statistics);
-	snmp_mib_free((void **)net->mib.icmp_statistics);
-	snmp_mib_free((void **)net->mib.udplite_statistics);
-	snmp_mib_free((void **)net->mib.udp_statistics);
-	snmp_mib_free((void **)net->mib.net_statistics);
-	snmp_mib_free((void **)net->mib.ip_statistics);
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 }
 
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
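
One visible effect of the inet_create() change earlier in this file's diff: raw-socket creation is now gated directly on CAP_NET_RAW instead of a per-protosw .capability value, and in-kernel callers (kern != 0) bypass the check. A minimal user-space sketch of the behaviour, assuming nothing beyond standard socket(2) semantics:

	#include <errno.h>
	#include <stdio.h>
	#include <netinet/in.h>
	#include <sys/socket.h>

	int main(void)
	{
		/* Without CAP_NET_RAW, inet_create() fails the capable()
		 * check and socket() returns -1 with errno == EPERM. */
		int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

		if (fd < 0)
			perror("socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)");
		return 0;
	}
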
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5c662703eb1e..880a5ec6dce0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,14 +1,73 @@
+#include <crypto/hash.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/ah.h>
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
-#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
 #include <net/icmp.h>
 #include <net/protocol.h>
 
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int len;
+
+	len = size + crypto_ahash_digestsize(ahash) +
+	      (crypto_ahash_alignmask(ahash) &
+	       ~(crypto_tfm_ctx_alignment() - 1));
+
+	len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+	len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+	return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+			     unsigned int offset)
+{
+	return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	struct ahash_request *req;
+
+	req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+				crypto_tfm_ctx_alignment());
+
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					    struct ahash_request *req)
+{
+	return (void *)ALIGN((unsigned long)(req + 1) +
+			     crypto_ahash_reqsize(ahash),
+			     __alignof__(struct scatterlist));
+}
 
 /* Clear mutable options and find final destination to substitute
  * into IP header for icv calculation. Options are already checked
@@ -54,20 +113,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
 	return 0;
 }
 
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+	u8 *icv;
+	struct iphdr *iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct iphdr *top_iph = ip_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+
+	iph = AH_SKB_CB(skb)->tmp;
+	icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	err = ah->nexthdr;
+
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_output_resume(skb, err);
+}
+
 static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
+	int nfrags;
+	int ihl;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
-	union {
-		struct iphdr	iph;
-		char		buf[60];
-	} tmp_iph;
+
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
 
 	skb_push(skb, -skb_network_offset(skb));
+	ah = ip_auth_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	err = -ENOMEM;
+	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	if (!iph)
+		goto out;
+
+	icv = ah_tmp_icv(ahash, iph, ihl);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
 	top_iph = ip_hdr(skb);
-	iph = &tmp_iph.iph;
 
 	iph->tos = top_iph->tos;
 	iph->ttl = top_iph->ttl;
@@ -78,10 +189,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
 		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
 		if (err)
-			goto error;
+			goto out_free;
 	}
 
-	ah = ip_auth_hdr(skb);
 	ah->nexthdr = *skb_mac_header(skb);
 	*skb_mac_header(skb) = IPPROTO_AH;
 
@@ -91,20 +201,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ahp = x->data;
 	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
 
-	spin_lock_bh(&x->lock);
-	err = ah_mac_digest(ahp, skb, ah->auth_data);
-	memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
-	spin_unlock_bh(&x->lock);
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	if (err)
-		goto error;
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+	AH_SKB_CB(skb)->tmp = iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
 
 	top_iph->tos = iph->tos;
 	top_iph->ttl = iph->ttl;
@@ -114,28 +235,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
 	}
 
-	err = 0;
-
-error:
+out_free:
+	kfree(iph);
+out:
 	return err;
 }
 
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+	u8 *auth_data;
+	u8 *icv;
+	struct iphdr *work_iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+
+	work_iph = AH_SKB_CB(skb)->tmp;
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+
+	err = ah->nexthdr;
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
 static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ah_hlen;
 	int ihl;
 	int nexthdr;
-	int err = -EINVAL;
-	struct iphdr *iph;
+	int nfrags;
+	u8 *auth_data;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *work_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
-	char work_buf[60];
+	int err = -ENOMEM;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
 
 	ah = (struct ip_auth_hdr *)skb->data;
 	ahp = x->data;
+	ahash = ahp->ahash;
+
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
@@ -156,9 +316,24 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	ah = (struct ip_auth_hdr *)skb->data;
 	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (!work_iph)
+		goto out;
+
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
 
-	ihl = skb->data - skb_network_header(skb);
-	memcpy(work_buf, iph, ihl);
+	memcpy(work_iph, iph, ihl);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
 	iph->ttl = 0;
 	iph->tos = 0;
@@ -166,35 +341,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph->check = 0;
 	if (ihl > sizeof(*iph)) {
 		__be32 dummy;
-		if (ip_clear_mutable_options(iph, &dummy))
-			goto out;
+		err = ip_clear_mutable_options(iph, &dummy);
+		if (err)
+			goto out_free;
 	}
 
-	spin_lock(&x->lock);
-	{
-		u8 auth_data[MAX_AH_AUTH_LEN];
+	skb_push(skb, ihl);
 
-		memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
-		skb_push(skb, ihl);
-		err = ah_mac_digest(ahp, skb, ah->auth_data);
-		if (err)
-			goto unlock;
-		if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len))
-			err = -EBADMSG;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
 	}
-unlock:
-	spin_unlock(&x->lock);
 
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
 	if (err)
-		goto out;
+		goto out_free;
 
 	skb->network_header += ah_hlen;
-	memcpy(skb_network_header(skb), work_buf, ihl);
-	skb->transport_header = skb->network_header;
+	memcpy(skb_network_header(skb), work_iph, ihl);
 	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
 
-	return nexthdr;
+	err = nexthdr;
 
+out_free:
+	kfree (work_iph);
 out:
 	return err;
 }
@@ -210,7 +394,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -222,7 +406,7 @@ static int ah_init_state(struct xfrm_state *x)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
-	struct crypto_hash *tfm;
+	struct crypto_ahash *ahash;
 
 	if (!x->aalg)
 		goto error;
@@ -231,44 +415,40 @@ static int ah_init_state(struct xfrm_state *x)
 		goto error;
 
 	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
-	if (ahp == NULL)
+	if (!ahp)
 		return -ENOMEM;
 
-	tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm))
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash))
 		goto error;
 
-	ahp->tfm = tfm;
-	if (crypto_hash_setkey(tfm, x->aalg->alg_key,
-			       (x->aalg->alg_key_len + 7) / 8))
+	ahp->ahash = ahash;
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+				(x->aalg->alg_key_len + 7) / 8))
 		goto error;
 
 	/*
 	 * Lookup the algorithm description maintained by xfrm_algo,
 	 * verify crypto transform properties, and store information
 	 * we need for AH processing.  This lookup cannot fail here
-	 * after a successful crypto_alloc_hash().
+	 * after a successful crypto_alloc_ahash().
 	 */
 	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
 	BUG_ON(!aalg_desc);
 
 	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
-	    crypto_hash_digestsize(tfm)) {
+	    crypto_ahash_digestsize(ahash)) {
 		printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
-		       x->aalg->alg_name, crypto_hash_digestsize(tfm),
+		       x->aalg->alg_name, crypto_ahash_digestsize(ahash),
 		       aalg_desc->uinfo.auth.icv_fullbits/8);
 		goto error;
 	}
 
 	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
-	ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
-	if (!ahp->work_icv)
-		goto error;
-
 	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
 					  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -279,8 +459,7 @@ static int ah_init_state(struct xfrm_state *x)
 
 error:
 	if (ahp) {
-		kfree(ahp->work_icv);
-		crypto_free_hash(ahp->tfm);
+		crypto_free_ahash(ahp->ahash);
 		kfree(ahp);
 	}
 	return -EINVAL;
@@ -293,8 +472,7 @@ static void ah_destroy(struct xfrm_state *x)
 	if (!ahp)
 		return;
 
-	kfree(ahp->work_icv);
-	crypto_free_hash(ahp->tfm);
+	crypto_free_ahash(ahp->ahash);
 	kfree(ahp);
 }
 
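A note on the scratch-buffer helpers introduced at the top of this file (an editorial sketch inferred from ah_alloc_tmp() and its accessors, not text from the patch): a single GFP_ATOMIC allocation per packet is carved into four regions, each recovered from the base pointer by the same alignment arithmetic:

	/*
	 * Layout of the buffer returned by ah_alloc_tmp(ahash, nfrags, size):
	 *
	 *   offset 0   caller-defined prefix, "size" bytes (the saved IP
	 *              header, plus the saved ICV on input) -> ah_tmp_auth()
	 *   aligned    ICV, crypto_ahash_digestsize(ahash) bytes
	 *              -> ah_tmp_icv()
	 *   aligned    struct ahash_request + crypto_ahash_reqsize(ahash)
	 *              -> ah_tmp_req()
	 *   aligned    nfrags * struct scatterlist -> ah_req_sg()
	 *
	 * One allocation keeps the async completion handlers simple: both
	 * ah_output_done() and ah_input_done() recover everything from
	 * AH_SKB_CB(skb)->tmp and free it with a single kfree().
	 */
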
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4e80f336c0cf..80769f1f9fab 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -70,6 +70,7 @@
  *		bonding can change the skb before
  *		sending (e.g. insert 8021q tag).
  *		Harald Welte	:	convert to make use of jenkins hash
+ *		Jesper D. Brouer:       Proxy ARP PVLAN RFC 3069 support.
  */
 
 #include <linux/module.h>
@@ -97,6 +98,7 @@
 #include <linux/net.h>
 #include <linux/rcupdate.h>
 #include <linux/jhash.h>
+#include <linux/slab.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -524,12 +526,15 @@ int arp_bind_neighbour(struct dst_entry *dst)
 /*
  *	Check if we can use proxy ARP for this path
  */
-
-static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt)
 {
 	struct in_device *out_dev;
 	int imi, omi = -1;
 
+	if (rt->u.dst.dev == dev)
+		return 0;
+
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
 
@@ -548,6 +553,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
 }
 
 /*
+ *	Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface.  This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router.  As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ *   In RFC 3069 it is called VLAN Aggregation.
+ *   Cisco and Allied Telesyn call it Private VLAN.
+ *   Hewlett-Packard call it Source-Port filtering or port-isolation.
+ *   Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt,
+				__be32 sip, __be32 tip)
+{
+	/* Private VLAN is only concerned about the same ethernet segment */
+	if (rt->u.dst.dev != dev)
+		return 0;
+
+	/* Don't reply on self probes (often done by windowz boxes)*/
+	if (sip == tip)
+		return 0;
+
+	if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+		return 1;
+	else
+		return 0;
+}
+
+/*
  *	Interface to link layer: send routine and receive handler.
  */
 
@@ -619,13 +661,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 #endif
 #endif
 
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
 	case ARPHRD_FDDI:
 		arp->ar_hrd = htons(ARPHRD_ETHER);
 		arp->ar_pro = htons(ETH_P_IP);
 		break;
 #endif
-#ifdef CONFIG_TR
+#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
 	case ARPHRD_IEEE802_TR:
 		arp->ar_hrd = htons(ARPHRD_IEEE802);
 		arp->ar_pro = htons(ETH_P_IP);
@@ -833,8 +875,11 @@ static int arp_process(struct sk_buff *skb)
 		}
 		goto out;
 	} else if (IN_DEV_FORWARD(in_dev)) {
-		if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
-		    (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+		if (addr_type == RTN_UNICAST  &&
+		    (arp_fwd_proxy(in_dev, dev, rt) ||
+		     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
+		{
 			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 			if (n)
 				neigh_release(n);
@@ -863,7 +908,8 @@ static int arp_process(struct sk_buff *skb)
 		   devices (strip is candidate)
 		 */
 		if (n == NULL &&
-		    arp->ar_op == htons(ARPOP_REPLY) &&
+		    (arp->ar_op == htons(ARPOP_REPLY) ||
+		    (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
 		    inet_addr_type(net, sip) == RTN_UNICAST)
 			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
 	}
@@ -1005,7 +1051,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		return -EINVAL;
 	}
 	switch (dev->type) {
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
 	case ARPHRD_FDDI:
 		/*
 		 * According to RFC 1390, FDDI devices should accept ARP
@@ -1239,8 +1285,7 @@ void __init arp_init(void)
 	dev_add_pack(&arp_packet_type);
 	arp_proc_init();
 #ifdef CONFIG_SYSCTL
-	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
-			      NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+	neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
 #endif
 	register_netdevice_notifier(&arp_netdev_notifier);
 }
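
For completeness (not part of the diff): arp_fwd_pvlan() is gated by IN_DEV_PROXY_ARP_PVLAN(), a new per-device devconf flag whose sysctl plumbing lives outside this hunk. Assuming it is exported with the usual devconf sysctl naming (an assumption, not shown here), enabling RFC 3069 style proxy ARP on the router's downstream port would look like:

	# assumed names, following the net.ipv4.conf.<dev>.* devconf pattern
	sysctl -w net.ipv4.conf.eth0.proxy_arp=1
	sysctl -w net.ipv4.conf.eth0.proxy_arp_pvlan=1
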
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 039cc1ffe977..c97cd9ff697e 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -44,6 +44,7 @@
 #include <linux/string.h>
 #include <linux/jhash.h>
 #include <linux/audit.h>
+#include <linux/slab.h>
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -2017,7 +2018,7 @@ req_setattr_failure:
  * values on failure.
  *
  */
-int cipso_v4_delopt(struct ip_options **opt_ptr)
+static int cipso_v4_delopt(struct ip_options **opt_ptr)
 {
 	int hdr_delta = 0;
 	struct ip_options *opt = *opt_ptr;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5e6c5a0f3fde..fb2465811b48 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_reset(sk);
 
 	oif = sk->sk_bound_dev_if;
-	saddr = inet->saddr;
+	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
 		if (!oif)
 			oif = inet->mc_index;
@@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
 			       RT_CONN_FLAGS(sk), oif,
 			       sk->sk_protocol,
-			       inet->sport, usin->sin_port, sk, 1);
+			       inet->inet_sport, usin->sin_port, sk, 1);
 	if (err) {
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		ip_rt_put(rt);
 		return -EACCES;
 	}
-	if (!inet->saddr)
-		inet->saddr = rt->rt_src;	/* Update source address */
-	if (!inet->rcv_saddr)
-		inet->rcv_saddr = rt->rt_src;
-	inet->daddr = rt->rt_dst;
-	inet->dport = usin->sin_port;
+	if (!inet->inet_saddr)
+		inet->inet_saddr = rt->rt_src;	/* Update source address */
+	if (!inet->inet_rcv_saddr)
+		inet->inet_rcv_saddr = rt->rt_src;
+	inet->inet_daddr = rt->rt_dst;
+	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
-	inet->id = jiffies;
+	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->u.dst);
 	return(0);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5df2f6a0b0f0..90e3d6379a42 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -50,6 +50,7 @@
 #include <linux/notifier.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
+#include <linux/slab.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -64,20 +65,20 @@
 
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
-		[NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
 	},
 };
 
 static struct ipv4_devconf ipv4_devconf_dflt = {
 	.data = {
-		[NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
-		[NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
 	},
 };
 
@@ -140,11 +141,11 @@ void in_dev_finish_destroy(struct in_device *idev)
 #endif
 	dev_put(dev);
 	if (!idev->dead)
-		printk("Freeing alive in_device %p\n", idev);
-	else {
+		pr_err("Freeing alive in_device %p\n", idev);
+	else
 		kfree(idev);
-	}
 }
+EXPORT_SYMBOL(in_dev_finish_destroy);
 
 static struct in_device *inetdev_init(struct net_device *dev)
 {
@@ -159,7 +160,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
 			sizeof(in_dev->cnf));
 	in_dev->cnf.sysctl = NULL;
 	in_dev->dev = dev;
-	if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+	in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+	if (!in_dev->arp_parms)
 		goto out_kfree;
 	if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
 		dev_disable_lro(dev);
@@ -405,13 +407,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
 	struct in_device *in_dev = NULL;
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifindex);
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		in_dev = in_dev_get(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return in_dev;
 }
+EXPORT_SYMBOL(inetdev_by_index);
 
 /* Called only from RTNL semaphored context. No locks. */
 
@@ -557,7 +561,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
  *	Determine a default network mask, based on the IP address.
  */
 
-static __inline__ int inet_abc_len(__be32 addr)
+static inline int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -646,13 +650,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 
 	ret = -ENODEV;
-	if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL)
+	dev = __dev_get_by_name(net, ifr.ifr_name);
+	if (!dev)
 		goto done;
 
 	if (colon)
 		*colon = ':';
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
 		if (tryaddrmatch) {
 			/* Matthias Andree */
 			/* compare label and address (4.4BSD style) */
@@ -720,7 +726,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 		if (!ifa) {
 			ret = -ENOBUFS;
-			if ((ifa = inet_alloc_ifa()) == NULL)
+			ifa = inet_alloc_ifa();
+			if (!ifa)
 				break;
 			if (colon)
 				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
@@ -822,10 +829,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 	struct ifreq ifr;
 	int done = 0;
 
-	if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+	if (!in_dev)
 		goto out;
 
-	for (; ifa; ifa = ifa->ifa_next) {
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
 		if (!buf) {
 			done += sizeof(ifr);
 			continue;
@@ -875,36 +882,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 		if (!addr)
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
-no_in_dev:
-	rcu_read_unlock();
 
 	if (addr)
-		goto out;
+		goto out_unlock;
+no_in_dev:
 
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is importnat that lo is the first interface
 	   in dev_base list.
 	 */
-	read_lock(&dev_base_lock);
-	rcu_read_lock();
-	for_each_netdev(net, dev) {
-		if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (!in_dev)
 			continue;
 
 		for_primary_ifa(in_dev) {
 			if (ifa->ifa_scope != RT_SCOPE_LINK &&
 			    ifa->ifa_scope <= scope) {
 				addr = ifa->ifa_local;
-				goto out_unlock_both;
+				goto out_unlock;
 			}
 		} endfor_ifa(in_dev);
 	}
-out_unlock_both:
-	read_unlock(&dev_base_lock);
+out_unlock:
 	rcu_read_unlock();
-out:
 	return addr;
 }
+EXPORT_SYMBOL(inet_select_addr);
 
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 				 __be32 local, int scope)
@@ -940,7 +944,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 		}
 	} endfor_ifa(in_dev);
 
-	return same? addr : 0;
+	return same ? addr : 0;
 }
 
 /*
@@ -961,17 +965,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
 		return confirm_addr_indev(in_dev, dst, local, scope);
 
 	net = dev_net(in_dev->dev);
-	read_lock(&dev_base_lock);
 	rcu_read_lock();
-	for_each_netdev(net, dev) {
-		if ((in_dev = __in_dev_get_rcu(dev))) {
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (in_dev) {
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
 			if (addr)
 				break;
 		}
 	}
 	rcu_read_unlock();
-	read_unlock(&dev_base_lock);
 
 	return addr;
 }
@@ -984,14 +987,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&inetaddr_chain, nb);
 }
+EXPORT_SYMBOL(register_inetaddr_notifier);
 
 int unregister_inetaddr_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
 }
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
 
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
 */
 static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 {
@@ -1010,11 +1015,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
1010 sprintf(old, ":%d", named); 1015 sprintf(old, ":%d", named);
1011 dot = old; 1016 dot = old;
1012 } 1017 }
1013 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { 1018 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
1014 strcat(ifa->ifa_label, dot); 1019 strcat(ifa->ifa_label, dot);
1015 } else { 1020 else
1016 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); 1021 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
1017 }
1018skip: 1022skip:
1019 rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); 1023 rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
1020 } 1024 }
@@ -1061,8 +1065,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1061 if (!inetdev_valid_mtu(dev->mtu)) 1065 if (!inetdev_valid_mtu(dev->mtu))
1062 break; 1066 break;
1063 if (dev->flags & IFF_LOOPBACK) { 1067 if (dev->flags & IFF_LOOPBACK) {
1064 struct in_ifaddr *ifa; 1068 struct in_ifaddr *ifa = inet_alloc_ifa();
1065 if ((ifa = inet_alloc_ifa()) != NULL) { 1069
1070 if (ifa) {
1066 ifa->ifa_local = 1071 ifa->ifa_local =
1067 ifa->ifa_address = htonl(INADDR_LOOPBACK); 1072 ifa->ifa_address = htonl(INADDR_LOOPBACK);
1068 ifa->ifa_prefixlen = 8; 1073 ifa->ifa_prefixlen = 8;
@@ -1170,38 +1175,54 @@ nla_put_failure:
1170static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 1175static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1171{ 1176{
1172 struct net *net = sock_net(skb->sk); 1177 struct net *net = sock_net(skb->sk);
1173 int idx, ip_idx; 1178 int h, s_h;
1179 int idx, s_idx;
1180 int ip_idx, s_ip_idx;
1174 struct net_device *dev; 1181 struct net_device *dev;
1175 struct in_device *in_dev; 1182 struct in_device *in_dev;
1176 struct in_ifaddr *ifa; 1183 struct in_ifaddr *ifa;
1177 int s_ip_idx, s_idx = cb->args[0]; 1184 struct hlist_head *head;
1185 struct hlist_node *node;
1178 1186
1179 s_ip_idx = ip_idx = cb->args[1]; 1187 s_h = cb->args[0];
1180 idx = 0; 1188 s_idx = idx = cb->args[1];
1181 for_each_netdev(net, dev) { 1189 s_ip_idx = ip_idx = cb->args[2];
1182 if (idx < s_idx) 1190
1183 goto cont; 1191 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1184 if (idx > s_idx) 1192 idx = 0;
1185 s_ip_idx = 0; 1193 head = &net->dev_index_head[h];
1186 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) 1194 rcu_read_lock();
1187 goto cont; 1195 hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
1188 1196 if (idx < s_idx)
1189 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; 1197 goto cont;
1190 ifa = ifa->ifa_next, ip_idx++) { 1198 if (h > s_h || idx > s_idx)
1191 if (ip_idx < s_ip_idx) 1199 s_ip_idx = 0;
1192 continue; 1200 in_dev = __in_dev_get_rcu(dev);
1193 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, 1201 if (!in_dev)
1202 goto cont;
1203
1204 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
1205 ifa = ifa->ifa_next, ip_idx++) {
1206 if (ip_idx < s_ip_idx)
1207 continue;
1208 if (inet_fill_ifaddr(skb, ifa,
1209 NETLINK_CB(cb->skb).pid,
1194 cb->nlh->nlmsg_seq, 1210 cb->nlh->nlmsg_seq,
1195 RTM_NEWADDR, NLM_F_MULTI) <= 0) 1211 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1196 goto done; 1212 rcu_read_unlock();
1197 } 1213 goto done;
1214 }
1215 }
1198cont: 1216cont:
1199 idx++; 1217 idx++;
1218 }
1219 rcu_read_unlock();
1200 } 1220 }
1201 1221
1202done: 1222done:
1203 cb->args[0] = idx; 1223 cb->args[0] = h;
1204 cb->args[1] = ip_idx; 1224 cb->args[1] = idx;
1225 cb->args[2] = ip_idx;
1205 1226
1206 return skb->len; 1227 return skb->len;
1207} 1228}
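
The rewritten dump turns cb->args[] into a three-level cursor of {hash bucket, device index, address index}, so a netlink dump can stop when the skb fills up and resume exactly where it left off; note how each inner cursor resets to zero whenever an outer level advances (the "h++, s_idx = 0" clause and the "h > s_h || idx > s_idx" test). A standalone sketch of that resumable-cursor pattern, with illustrative sizes and names:

/* Sketch: resume a nested walk from a saved (h, idx, ip_idx) cursor,
 * emitting at most BUDGET entries per call, as a netlink dump does. */
#include <stdio.h>

#define NBUCKETS 4
#define NDEVS 3
#define NADDRS 2
#define BUDGET 5

struct cursor { int h, idx, ip_idx; };

static int dump(struct cursor *c)
{
        int emitted = 0;

        for (int h = c->h; h < NBUCKETS; h++, c->idx = 0) {
                for (int idx = c->idx; idx < NDEVS; idx++, c->ip_idx = 0) {
                        for (int ip = c->ip_idx; ip < NADDRS; ip++) {
                                if (emitted == BUDGET) {
                                        /* "skb full": save the position */
                                        c->h = h;
                                        c->idx = idx;
                                        c->ip_idx = ip;
                                        return emitted;
                                }
                                printf("bucket=%d dev=%d addr=%d\n",
                                       h, idx, ip);
                                emitted++;
                        }
                }
        }
        c->h = NBUCKETS;        /* walk complete */
        return emitted;
}

int main(void)
{
        struct cursor c = { 0, 0, 0 };

        while (dump(&c) == BUDGET)
                puts("-- next dump call --");
        return 0;
}
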
@@ -1239,18 +1260,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i)
1239{ 1260{
1240 struct net_device *dev; 1261 struct net_device *dev;
1241 1262
1242 read_lock(&dev_base_lock); 1263 rcu_read_lock();
1243 for_each_netdev(net, dev) { 1264 for_each_netdev_rcu(net, dev) {
1244 struct in_device *in_dev; 1265 struct in_device *in_dev;
1245 rcu_read_lock(); 1266
1246 in_dev = __in_dev_get_rcu(dev); 1267 in_dev = __in_dev_get_rcu(dev);
1247 if (in_dev && !test_bit(i, in_dev->cnf.state)) 1268 if (in_dev && !test_bit(i, in_dev->cnf.state))
1248 in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; 1269 in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
1249 rcu_read_unlock();
1250 } 1270 }
1251 read_unlock(&dev_base_lock); 1271 rcu_read_unlock();
1252} 1272}
1253 1273
1274/* called with RTNL locked */
1254static void inet_forward_change(struct net *net) 1275static void inet_forward_change(struct net *net)
1255{ 1276{
1256 struct net_device *dev; 1277 struct net_device *dev;
@@ -1259,7 +1280,6 @@ static void inet_forward_change(struct net *net)
1259 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; 1280 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
1260 IPV4_DEVCONF_DFLT(net, FORWARDING) = on; 1281 IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
1261 1282
1262 read_lock(&dev_base_lock);
1263 for_each_netdev(net, dev) { 1283 for_each_netdev(net, dev) {
1264 struct in_device *in_dev; 1284 struct in_device *in_dev;
1265 if (on) 1285 if (on)
@@ -1270,7 +1290,6 @@ static void inet_forward_change(struct net *net)
1270 IN_DEV_CONF_SET(in_dev, FORWARDING, on); 1290 IN_DEV_CONF_SET(in_dev, FORWARDING, on);
1271 rcu_read_unlock(); 1291 rcu_read_unlock();
1272 } 1292 }
1273 read_unlock(&dev_base_lock);
1274} 1293}
1275 1294
1276static int devinet_conf_proc(ctl_table *ctl, int write, 1295static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1293,72 +1312,25 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1293 return ret; 1312 return ret;
1294} 1313}
1295 1314
1296static int devinet_conf_sysctl(ctl_table *table,
1297 void __user *oldval, size_t __user *oldlenp,
1298 void __user *newval, size_t newlen)
1299{
1300 struct ipv4_devconf *cnf;
1301 struct net *net;
1302 int *valp = table->data;
1303 int new;
1304 int i;
1305
1306 if (!newval || !newlen)
1307 return 0;
1308
1309 if (newlen != sizeof(int))
1310 return -EINVAL;
1311
1312 if (get_user(new, (int __user *)newval))
1313 return -EFAULT;
1314
1315 if (new == *valp)
1316 return 0;
1317
1318 if (oldval && oldlenp) {
1319 size_t len;
1320
1321 if (get_user(len, oldlenp))
1322 return -EFAULT;
1323
1324 if (len) {
1325 if (len > table->maxlen)
1326 len = table->maxlen;
1327 if (copy_to_user(oldval, valp, len))
1328 return -EFAULT;
1329 if (put_user(len, oldlenp))
1330 return -EFAULT;
1331 }
1332 }
1333
1334 *valp = new;
1335
1336 cnf = table->extra1;
1337 net = table->extra2;
1338 i = (int *)table->data - cnf->data;
1339
1340 set_bit(i, cnf->state);
1341
1342 if (cnf == net->ipv4.devconf_dflt)
1343 devinet_copy_dflt_conf(net, i);
1344
1345 return 1;
1346}
1347
1348static int devinet_sysctl_forward(ctl_table *ctl, int write, 1315static int devinet_sysctl_forward(ctl_table *ctl, int write,
1349 void __user *buffer, 1316 void __user *buffer,
1350 size_t *lenp, loff_t *ppos) 1317 size_t *lenp, loff_t *ppos)
1351{ 1318{
1352 int *valp = ctl->data; 1319 int *valp = ctl->data;
1353 int val = *valp; 1320 int val = *valp;
1321 loff_t pos = *ppos;
1354 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); 1322 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1355 1323
1356 if (write && *valp != val) { 1324 if (write && *valp != val) {
1357 struct net *net = ctl->extra2; 1325 struct net *net = ctl->extra2;
1358 1326
1359 if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { 1327 if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
1360 if (!rtnl_trylock()) 1328 if (!rtnl_trylock()) {
1329 /* Restore the original values before restarting */
1330 *valp = val;
1331 *ppos = pos;
1361 return restart_syscall(); 1332 return restart_syscall();
1333 }
1362 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { 1334 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
1363 inet_forward_change(net); 1335 inet_forward_change(net);
1364 } else if (*valp) { 1336 } else if (*valp) {
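
The hunk above fixes a subtle restart bug: proc_dointvec() has already written the new value and advanced *ppos by the time rtnl_trylock() fails, so both must be restored or the restarted write would see *valp already equal to the new value and skip the side effects. A userspace sketch of the restore-before-restart pattern, with pthread_mutex_trylock() and -EAGAIN standing in for rtnl_trylock() and restart_syscall() (illustrative only):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;

static int set_forwarding(int *valp, long *ppos, int newval, long newpos)
{
        int old_val = *valp;
        long old_pos = *ppos;

        /* the proc handler has already committed the write... */
        *valp = newval;
        *ppos = newpos;

        if (pthread_mutex_trylock(&rtnl) != 0) {
                /* ...so undo it before asking for a restart, or the
                 * retry would see *valp == newval and do nothing */
                *valp = old_val;
                *ppos = old_pos;
                return -EAGAIN;
        }
        printf("forwarding=%d committed at pos=%ld\n", *valp, *ppos);
        pthread_mutex_unlock(&rtnl);
        return 0;
}

int main(void)
{
        int val = 0;
        long pos = 0;

        while (set_forwarding(&val, &pos, 1, 4) == -EAGAIN)
                ;       /* the kernel instead re-enters via restart_syscall() */
        return 0;
}
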
@@ -1390,57 +1362,37 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
1390 return ret; 1362 return ret;
1391} 1363}
1392 1364
1393int ipv4_doint_and_flush_strategy(ctl_table *table, 1365#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
1394 void __user *oldval, size_t __user *oldlenp,
1395 void __user *newval, size_t newlen)
1396{
1397 int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen);
1398 struct net *net = table->extra2;
1399
1400 if (ret == 1)
1401 rt_cache_flush(net, 0);
1402
1403 return ret;
1404}
1405
1406
1407#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \
1408 { \ 1366 { \
1409 .ctl_name = NET_IPV4_CONF_ ## attr, \
1410 .procname = name, \ 1367 .procname = name, \
1411 .data = ipv4_devconf.data + \ 1368 .data = ipv4_devconf.data + \
1412 NET_IPV4_CONF_ ## attr - 1, \ 1369 IPV4_DEVCONF_ ## attr - 1, \
1413 .maxlen = sizeof(int), \ 1370 .maxlen = sizeof(int), \
1414 .mode = mval, \ 1371 .mode = mval, \
1415 .proc_handler = proc, \ 1372 .proc_handler = proc, \
1416 .strategy = sysctl, \
1417 .extra1 = &ipv4_devconf, \ 1373 .extra1 = &ipv4_devconf, \
1418 } 1374 }
1419 1375
1420#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ 1376#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
1421 DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \ 1377 DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
1422 devinet_conf_sysctl)
1423 1378
1424#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ 1379#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
1425 DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \ 1380 DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
1426 devinet_conf_sysctl)
1427 1381
1428#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \ 1382#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
1429 DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl) 1383 DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
1430 1384
1431#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ 1385#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
1432 DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \ 1386 DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
1433 ipv4_doint_and_flush_strategy)
1434 1387
1435static struct devinet_sysctl_table { 1388static struct devinet_sysctl_table {
1436 struct ctl_table_header *sysctl_header; 1389 struct ctl_table_header *sysctl_header;
1437 struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; 1390 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
1438 char *dev_name; 1391 char *dev_name;
1439} devinet_sysctl = { 1392} devinet_sysctl = {
1440 .devinet_vars = { 1393 .devinet_vars = {
1441 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 1394 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
1442 devinet_sysctl_forward, 1395 devinet_sysctl_forward),
1443 devinet_conf_sysctl),
1444 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), 1396 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
1445 1397
1446 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), 1398 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
@@ -1450,6 +1402,8 @@ static struct devinet_sysctl_table {
1450 DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), 1402 DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
1451 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, 1403 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
1452 "accept_source_route"), 1404 "accept_source_route"),
1405 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
1406 DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
1453 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), 1407 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
1454 DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), 1408 DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
1455 DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), 1409 DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1460,6 +1414,7 @@ static struct devinet_sysctl_table {
1460 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), 1414 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
1461 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), 1415 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
1462 DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), 1416 DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
1417 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
1463 1418
1464 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), 1419 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
1465 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), 1420 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
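
With the binary-sysctl .ctl_name and .strategy fields gone, each table row reduces to the procname/data/mode/proc_handler tuple, which the DEVINET_SYSCTL_*_ENTRY macros stamp out as designated initializers. A simplified model of that macro-generated table style (toy types, not the real ctl_table):

#include <stdio.h>

struct ctl_entry {
        const char *procname;
        int *data;
        int mode;
};

static int devconf_data[8];

#define SYSCTL_ENTRY(idx, name, mval) \
        { .procname = name, .data = &devconf_data[idx], .mode = mval }
#define SYSCTL_RW_ENTRY(idx, name) SYSCTL_ENTRY(idx, name, 0644)
#define SYSCTL_RO_ENTRY(idx, name) SYSCTL_ENTRY(idx, name, 0444)

static struct ctl_entry table[] = {
        SYSCTL_RW_ENTRY(0, "forwarding"),
        SYSCTL_RO_ENTRY(1, "mc_forwarding"),
        SYSCTL_RW_ENTRY(2, "accept_local"),
        { }                             /* sentinel, as in ctl_table */
};

int main(void)
{
        for (struct ctl_entry *e = table; e->procname; e++)
                printf("%-16s mode=%04o\n", e->procname, e->mode);
        return 0;
}
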
@@ -1471,7 +1426,7 @@ static struct devinet_sysctl_table {
1471}; 1426};
1472 1427
1473static int __devinet_sysctl_register(struct net *net, char *dev_name, 1428static int __devinet_sysctl_register(struct net *net, char *dev_name,
1474 int ctl_name, struct ipv4_devconf *p) 1429 struct ipv4_devconf *p)
1475{ 1430{
1476 int i; 1431 int i;
1477 struct devinet_sysctl_table *t; 1432 struct devinet_sysctl_table *t;
@@ -1479,9 +1434,9 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1479#define DEVINET_CTL_PATH_DEV 3 1434#define DEVINET_CTL_PATH_DEV 3
1480 1435
1481 struct ctl_path devinet_ctl_path[] = { 1436 struct ctl_path devinet_ctl_path[] = {
1482 { .procname = "net", .ctl_name = CTL_NET, }, 1437 { .procname = "net", },
1483 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 1438 { .procname = "ipv4", },
1484 { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, 1439 { .procname = "conf", },
1485 { /* to be set */ }, 1440 { /* to be set */ },
1486 { }, 1441 { },
1487 }; 1442 };
@@ -1506,7 +1461,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1506 goto free; 1461 goto free;
1507 1462
1508 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; 1463 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
1509 devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name;
1510 1464
1511 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, 1465 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
1512 t->devinet_vars); 1466 t->devinet_vars);
@@ -1539,10 +1493,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1539 1493
1540static void devinet_sysctl_register(struct in_device *idev) 1494static void devinet_sysctl_register(struct in_device *idev)
1541{ 1495{
1542 neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, 1496 neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
1543 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1544 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, 1497 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
1545 idev->dev->ifindex, &idev->cnf); 1498 &idev->cnf);
1546} 1499}
1547 1500
1548static void devinet_sysctl_unregister(struct in_device *idev) 1501static void devinet_sysctl_unregister(struct in_device *idev)
@@ -1553,14 +1506,12 @@ static void devinet_sysctl_unregister(struct in_device *idev)
1553 1506
1554static struct ctl_table ctl_forward_entry[] = { 1507static struct ctl_table ctl_forward_entry[] = {
1555 { 1508 {
1556 .ctl_name = NET_IPV4_FORWARD,
1557 .procname = "ip_forward", 1509 .procname = "ip_forward",
1558 .data = &ipv4_devconf.data[ 1510 .data = &ipv4_devconf.data[
1559 NET_IPV4_CONF_FORWARDING - 1], 1511 IPV4_DEVCONF_FORWARDING - 1],
1560 .maxlen = sizeof(int), 1512 .maxlen = sizeof(int),
1561 .mode = 0644, 1513 .mode = 0644,
1562 .proc_handler = devinet_sysctl_forward, 1514 .proc_handler = devinet_sysctl_forward,
1563 .strategy = devinet_conf_sysctl,
1564 .extra1 = &ipv4_devconf, 1515 .extra1 = &ipv4_devconf,
1565 .extra2 = &init_net, 1516 .extra2 = &init_net,
1566 }, 1517 },
@@ -1568,8 +1519,8 @@ static struct ctl_table ctl_forward_entry[] = {
1568}; 1519};
1569 1520
1570static __net_initdata struct ctl_path net_ipv4_path[] = { 1521static __net_initdata struct ctl_path net_ipv4_path[] = {
1571 { .procname = "net", .ctl_name = CTL_NET, }, 1522 { .procname = "net", },
1572 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 1523 { .procname = "ipv4", },
1573 { }, 1524 { },
1574}; 1525};
1575#endif 1526#endif
@@ -1587,7 +1538,7 @@ static __net_init int devinet_init_net(struct net *net)
1587 all = &ipv4_devconf; 1538 all = &ipv4_devconf;
1588 dflt = &ipv4_devconf_dflt; 1539 dflt = &ipv4_devconf_dflt;
1589 1540
1590 if (net != &init_net) { 1541 if (!net_eq(net, &init_net)) {
1591 all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); 1542 all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
1592 if (all == NULL) 1543 if (all == NULL)
1593 goto err_alloc_all; 1544 goto err_alloc_all;
@@ -1601,20 +1552,18 @@ static __net_init int devinet_init_net(struct net *net)
1601 if (tbl == NULL) 1552 if (tbl == NULL)
1602 goto err_alloc_ctl; 1553 goto err_alloc_ctl;
1603 1554
1604 tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; 1555 tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
1605 tbl[0].extra1 = all; 1556 tbl[0].extra1 = all;
1606 tbl[0].extra2 = net; 1557 tbl[0].extra2 = net;
1607#endif 1558#endif
1608 } 1559 }
1609 1560
1610#ifdef CONFIG_SYSCTL 1561#ifdef CONFIG_SYSCTL
1611 err = __devinet_sysctl_register(net, "all", 1562 err = __devinet_sysctl_register(net, "all", all);
1612 NET_PROTO_CONF_ALL, all);
1613 if (err < 0) 1563 if (err < 0)
1614 goto err_reg_all; 1564 goto err_reg_all;
1615 1565
1616 err = __devinet_sysctl_register(net, "default", 1566 err = __devinet_sysctl_register(net, "default", dflt);
1617 NET_PROTO_CONF_DEFAULT, dflt);
1618 if (err < 0) 1567 if (err < 0)
1619 goto err_reg_dflt; 1568 goto err_reg_dflt;
1620 1569
@@ -1680,8 +1629,3 @@ void __init devinet_init(void)
1680 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); 1629 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
1681} 1630}
1682 1631
1683EXPORT_SYMBOL(in_dev_finish_destroy);
1684EXPORT_SYMBOL(inet_select_addr);
1685EXPORT_SYMBOL(inetdev_by_index);
1686EXPORT_SYMBOL(register_inetaddr_notifier);
1687EXPORT_SYMBOL(unregister_inetaddr_notifier);
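
The per-symbol EXPORT_SYMBOL() lines added earlier replace this block at the end of the file, keeping each export next to its definition. The notifier pair exported above wraps a blocking notifier chain; a toy single-threaded model of the callback-chain idea (illustrative, without the locking and priorities the kernel chain has):

#include <stdio.h>

struct notifier_block {
        int (*notifier_call)(struct notifier_block *nb,
                             unsigned long event, void *arg);
        struct notifier_block *next;
};

static struct notifier_block *inetaddr_chain;

static void chain_register(struct notifier_block *nb)
{
        nb->next = inetaddr_chain;
        inetaddr_chain = nb;
}

static void chain_call(unsigned long event, void *arg)
{
        for (struct notifier_block *nb = inetaddr_chain; nb; nb = nb->next)
                nb->notifier_call(nb, event, arg);
}

static int on_addr_event(struct notifier_block *nb, unsigned long event,
                         void *arg)
{
        printf("inetaddr event %lu on %s\n", event, (const char *)arg);
        return 0;
}

int main(void)
{
        struct notifier_block nb = { .notifier_call = on_addr_event };

        chain_register(&nb);
        chain_call(1, "eth0");  /* e.g. an address added on eth0 */
        return 0;
}
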
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 12f7287e902d..14ca1f1c3fb0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -422,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
423 return; 423 return;
424 424
425 x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 425 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
426 if (!x) 426 if (!x)
427 return; 427 return;
428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x)
530 } 530 }
531 531
532 err = crypto_aead_setauthsize( 532 err = crypto_aead_setauthsize(
533 aead, aalg_desc->uinfo.auth.icv_truncbits / 8); 533 aead, x->aalg->alg_trunc_len / 8);
534 if (err) 534 if (err)
535 goto free_key; 535 goto free_key;
536 } 536 }
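
The esp4_err() change widens the SA lookup key with skb->mark, so two states that differ only in mark resolve separately; the other hunk sizes the ICV from the configured truncation length (alg_trunc_len / 8 bytes) rather than the algorithm default. A trivial sketch of the widened key comparison (simplified types, illustrative values):

#include <stdint.h>
#include <stdio.h>

struct sa_key {
        uint32_t mark;          /* new: the packet mark joins the key */
        uint32_t daddr;
        uint32_t spi;
        uint8_t proto;
        uint8_t family;
};

static int sa_match(const struct sa_key *a, const struct sa_key *b)
{
        return a->mark == b->mark && a->daddr == b->daddr &&
               a->spi == b->spi && a->proto == b->proto &&
               a->family == b->family;
}

int main(void)
{
        struct sa_key pkt = {
                .mark = 7, .daddr = 0x0a000001,
                .spi = 0x1234, .proto = 50, .family = 2,
        };
        struct sa_key state = pkt;

        printf("match=%d\n", sa_match(&pkt, &state));
        state.mark = 8;         /* same SPI and daddr, different mark */
        printf("match=%d\n", sa_match(&pkt, &state));
        return 0;
}
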
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index aa00398be80e..4f0ed458c883 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,6 +34,7 @@
34#include <linux/skbuff.h> 34#include <linux/skbuff.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/slab.h>
37 38
38#include <net/ip.h> 39#include <net/ip.h>
39#include <net/protocol.h> 40#include <net/protocol.h>
@@ -125,7 +126,7 @@ void fib_select_default(struct net *net,
125#endif 126#endif
126 tb = fib_get_table(net, table); 127 tb = fib_get_table(net, table);
127 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 128 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 tb->tb_select_default(tb, flp, res); 129 fib_table_select_default(tb, flp, res);
129} 130}
130 131
131static void fib_flush(struct net *net) 132static void fib_flush(struct net *net)
@@ -139,7 +140,7 @@ static void fib_flush(struct net *net)
139 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 140 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 head = &net->ipv4.fib_table_hash[h]; 141 head = &net->ipv4.fib_table_hash[h];
141 hlist_for_each_entry(tb, node, head, tb_hlist) 142 hlist_for_each_entry(tb, node, head, tb_hlist)
142 flushed += tb->tb_flush(tb); 143 flushed += fib_table_flush(tb);
143 } 144 }
144 145
145 if (flushed) 146 if (flushed)
@@ -162,7 +163,7 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr)
162#endif 163#endif
163 164
164 local_table = fib_get_table(net, RT_TABLE_LOCAL); 165 local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) 166 if (!local_table || fib_table_lookup(local_table, &fl, &res))
166 return NULL; 167 return NULL;
167 if (res.type != RTN_LOCAL) 168 if (res.type != RTN_LOCAL)
168 goto out; 169 goto out;
@@ -200,7 +201,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
200 local_table = fib_get_table(net, RT_TABLE_LOCAL); 201 local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 if (local_table) { 202 if (local_table) {
202 ret = RTN_UNICAST; 203 ret = RTN_UNICAST;
203 if (!local_table->tb_lookup(local_table, &fl, &res)) { 204 if (!fib_table_lookup(local_table, &fl, &res)) {
204 if (!dev || dev == res.fi->fib_dev) 205 if (!dev || dev == res.fi->fib_dev)
205 ret = res.type; 206 ret = res.type;
206 fib_res_put(&res); 207 fib_res_put(&res);
@@ -241,16 +242,19 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
241 .iif = oif }; 242 .iif = oif };
242 243
243 struct fib_result res; 244 struct fib_result res;
244 int no_addr, rpf; 245 int no_addr, rpf, accept_local;
245 int ret; 246 int ret;
246 struct net *net; 247 struct net *net;
247 248
248 no_addr = rpf = 0; 249 no_addr = rpf = accept_local = 0;
249 rcu_read_lock(); 250 rcu_read_lock();
250 in_dev = __in_dev_get_rcu(dev); 251 in_dev = __in_dev_get_rcu(dev);
251 if (in_dev) { 252 if (in_dev) {
252 no_addr = in_dev->ifa_list == NULL; 253 no_addr = in_dev->ifa_list == NULL;
253 rpf = IN_DEV_RPFILTER(in_dev); 254 rpf = IN_DEV_RPFILTER(in_dev);
255 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
256 if (mark && !IN_DEV_SRC_VMARK(in_dev))
257 fl.mark = 0;
254 } 258 }
255 rcu_read_unlock(); 259 rcu_read_unlock();
256 260
@@ -260,8 +264,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
260 net = dev_net(dev); 264 net = dev_net(dev);
261 if (fib_lookup(net, &fl, &res)) 265 if (fib_lookup(net, &fl, &res))
262 goto last_resort; 266 goto last_resort;
263 if (res.type != RTN_UNICAST) 267 if (res.type != RTN_UNICAST) {
264 goto e_inval_res; 268 if (res.type != RTN_LOCAL || !accept_local)
269 goto e_inval_res;
270 }
265 *spec_dst = FIB_RES_PREFSRC(res); 271 *spec_dst = FIB_RES_PREFSRC(res);
266 fib_combine_itag(itag, &res); 272 fib_combine_itag(itag, &res);
267#ifdef CONFIG_IP_ROUTE_MULTIPATH 273#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -476,13 +482,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
476 if (cmd == SIOCDELRT) { 482 if (cmd == SIOCDELRT) {
477 tb = fib_get_table(net, cfg.fc_table); 483 tb = fib_get_table(net, cfg.fc_table);
478 if (tb) 484 if (tb)
479 err = tb->tb_delete(tb, &cfg); 485 err = fib_table_delete(tb, &cfg);
480 else 486 else
481 err = -ESRCH; 487 err = -ESRCH;
482 } else { 488 } else {
483 tb = fib_new_table(net, cfg.fc_table); 489 tb = fib_new_table(net, cfg.fc_table);
484 if (tb) 490 if (tb)
485 err = tb->tb_insert(tb, &cfg); 491 err = fib_table_insert(tb, &cfg);
486 else 492 else
487 err = -ENOBUFS; 493 err = -ENOBUFS;
488 } 494 }
@@ -597,7 +603,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
597 goto errout; 603 goto errout;
598 } 604 }
599 605
600 err = tb->tb_delete(tb, &cfg); 606 err = fib_table_delete(tb, &cfg);
601errout: 607errout:
602 return err; 608 return err;
603} 609}
@@ -619,7 +625,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
619 goto errout; 625 goto errout;
620 } 626 }
621 627
622 err = tb->tb_insert(tb, &cfg); 628 err = fib_table_insert(tb, &cfg);
623errout: 629errout:
624 return err; 630 return err;
625} 631}
@@ -650,7 +656,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
650 if (dumped) 656 if (dumped)
651 memset(&cb->args[2], 0, sizeof(cb->args) - 657 memset(&cb->args[2], 0, sizeof(cb->args) -
652 2 * sizeof(cb->args[0])); 658 2 * sizeof(cb->args[0]));
653 if (tb->tb_dump(tb, skb, cb) < 0) 659 if (fib_table_dump(tb, skb, cb) < 0)
654 goto out; 660 goto out;
655 dumped = 1; 661 dumped = 1;
656next: 662next:
@@ -704,9 +710,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
704 cfg.fc_scope = RT_SCOPE_HOST; 710 cfg.fc_scope = RT_SCOPE_HOST;
705 711
706 if (cmd == RTM_NEWROUTE) 712 if (cmd == RTM_NEWROUTE)
707 tb->tb_insert(tb, &cfg); 713 fib_table_insert(tb, &cfg);
708 else 714 else
709 tb->tb_delete(tb, &cfg); 715 fib_table_delete(tb, &cfg);
710} 716}
711 717
712void fib_add_ifaddr(struct in_ifaddr *ifa) 718void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -835,7 +841,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
835 local_bh_disable(); 841 local_bh_disable();
836 842
837 frn->tb_id = tb->tb_id; 843 frn->tb_id = tb->tb_id;
838 frn->err = tb->tb_lookup(tb, &fl, &res); 844 frn->err = fib_table_lookup(tb, &fl, &res);
839 845
840 if (!frn->err) { 846 if (!frn->err) {
841 frn->prefixlen = res.prefixlen; 847 frn->prefixlen = res.prefixlen;
@@ -878,7 +884,7 @@ static void nl_fib_input(struct sk_buff *skb)
878 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 884 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
879} 885}
880 886
881static int nl_fib_lookup_init(struct net *net) 887static int __net_init nl_fib_lookup_init(struct net *net)
882{ 888{
883 struct sock *sk; 889 struct sock *sk;
884 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, 890 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
@@ -895,11 +901,11 @@ static void nl_fib_lookup_exit(struct net *net)
895 net->ipv4.fibnl = NULL; 901 net->ipv4.fibnl = NULL;
896} 902}
897 903
898static void fib_disable_ip(struct net_device *dev, int force) 904static void fib_disable_ip(struct net_device *dev, int force, int delay)
899{ 905{
900 if (fib_sync_down_dev(dev, force)) 906 if (fib_sync_down_dev(dev, force))
901 fib_flush(dev_net(dev)); 907 fib_flush(dev_net(dev));
902 rt_cache_flush(dev_net(dev), 0); 908 rt_cache_flush(dev_net(dev), delay);
903 arp_ifdown(dev); 909 arp_ifdown(dev);
904} 910}
905 911
@@ -922,7 +928,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
922 /* Last address was deleted from this interface. 928 /* Last address was deleted from this interface.
923 Disable IP. 929 Disable IP.
924 */ 930 */
925 fib_disable_ip(dev, 1); 931 fib_disable_ip(dev, 1, 0);
926 } else { 932 } else {
927 rt_cache_flush(dev_net(dev), -1); 933 rt_cache_flush(dev_net(dev), -1);
928 } 934 }
@@ -937,7 +943,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
937 struct in_device *in_dev = __in_dev_get_rtnl(dev); 943 struct in_device *in_dev = __in_dev_get_rtnl(dev);
938 944
939 if (event == NETDEV_UNREGISTER) { 945 if (event == NETDEV_UNREGISTER) {
940 fib_disable_ip(dev, 2); 946 fib_disable_ip(dev, 2, -1);
941 return NOTIFY_DONE; 947 return NOTIFY_DONE;
942 } 948 }
943 949
@@ -955,12 +961,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
955 rt_cache_flush(dev_net(dev), -1); 961 rt_cache_flush(dev_net(dev), -1);
956 break; 962 break;
957 case NETDEV_DOWN: 963 case NETDEV_DOWN:
958 fib_disable_ip(dev, 0); 964 fib_disable_ip(dev, 0, 0);
959 break; 965 break;
960 case NETDEV_CHANGEMTU: 966 case NETDEV_CHANGEMTU:
961 case NETDEV_CHANGE: 967 case NETDEV_CHANGE:
962 rt_cache_flush(dev_net(dev), 0); 968 rt_cache_flush(dev_net(dev), 0);
963 break; 969 break;
970 case NETDEV_UNREGISTER_BATCH:
971 rt_cache_flush_batch();
972 break;
964 } 973 }
965 return NOTIFY_DONE; 974 return NOTIFY_DONE;
966} 975}
@@ -996,7 +1005,7 @@ fail:
996 return err; 1005 return err;
997} 1006}
998 1007
999static void __net_exit ip_fib_net_exit(struct net *net) 1008static void ip_fib_net_exit(struct net *net)
1000{ 1009{
1001 unsigned int i; 1010 unsigned int i;
1002 1011
@@ -1012,7 +1021,7 @@ static void __net_exit ip_fib_net_exit(struct net *net)
1012 head = &net->ipv4.fib_table_hash[i]; 1021 head = &net->ipv4.fib_table_hash[i];
1013 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { 1022 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1014 hlist_del(node); 1023 hlist_del(node);
1015 tb->tb_flush(tb); 1024 fib_table_flush(tb);
1016 kfree(tb); 1025 kfree(tb);
1017 } 1026 }
1018 } 1027 }
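
All the tb->tb_lookup/tb_insert/tb_delete indirect calls above become direct fib_table_*() calls; since FIB_HASH and FIB_TRIE are mutually exclusive build options, one backend provides the symbols and the per-table function pointers can go away. A sketch of that build-time devirtualization, with a preprocessor switch standing in for Kconfig:

#include <stdio.h>

struct fib_table {
        unsigned int id;
        /* before: int (*tb_lookup)(struct fib_table *, int); and friends */
};

#ifdef CONFIG_IP_FIB_TRIE_MODEL
static int fib_table_lookup(struct fib_table *tb, int key)
{
        printf("trie lookup, table %u, key %d\n", tb->id, key);
        return 0;
}
#else
static int fib_table_lookup(struct fib_table *tb, int key)
{
        printf("hash lookup, table %u, key %d\n", tb->id, key);
        return 0;
}
#endif

int main(void)
{
        struct fib_table main_tbl = { .id = 254 };

        /* before: main_tbl.tb_lookup(&main_tbl, 42) through a pointer;
         * after: a direct call the compiler can inline */
        return fib_table_lookup(&main_tbl, 42);
}
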
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd39454235c..4ed7e0dea1bc 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -32,6 +32,7 @@
32#include <linux/skbuff.h> 32#include <linux/skbuff.h>
33#include <linux/netlink.h> 33#include <linux/netlink.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/slab.h>
35 36
36#include <net/net_namespace.h> 37#include <net/net_namespace.h>
37#include <net/ip.h> 38#include <net/ip.h>
@@ -242,8 +243,8 @@ fn_new_zone(struct fn_hash *table, int z)
242 return fz; 243 return fz;
243} 244}
244 245
245static int 246int fib_table_lookup(struct fib_table *tb,
246fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 247 const struct flowi *flp, struct fib_result *res)
247{ 248{
248 int err; 249 int err;
249 struct fn_zone *fz; 250 struct fn_zone *fz;
@@ -274,8 +275,8 @@ out:
274 return err; 275 return err;
275} 276}
276 277
277static void 278void fib_table_select_default(struct fib_table *tb,
278fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 279 const struct flowi *flp, struct fib_result *res)
279{ 280{
280 int order, last_idx; 281 int order, last_idx;
281 struct hlist_node *node; 282 struct hlist_node *node;
@@ -366,7 +367,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
366 return NULL; 367 return NULL;
367} 368}
368 369
369static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) 370int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
370{ 371{
371 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 372 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
372 struct fib_node *new_f = NULL; 373 struct fib_node *new_f = NULL;
@@ -544,8 +545,7 @@ out:
544 return err; 545 return err;
545} 546}
546 547
547 548int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
548static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
549{ 549{
550 struct fn_hash *table = (struct fn_hash *)tb->tb_data; 550 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
551 struct fib_node *f; 551 struct fib_node *f;
@@ -662,7 +662,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
662 return found; 662 return found;
663} 663}
664 664
665static int fn_hash_flush(struct fib_table *tb) 665int fib_table_flush(struct fib_table *tb)
666{ 666{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 667 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz; 668 struct fn_zone *fz;
@@ -743,7 +743,8 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
743 return skb->len; 743 return skb->len;
744} 744}
745 745
746static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) 746int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
747 struct netlink_callback *cb)
747{ 748{
748 int m, s_m; 749 int m, s_m;
749 struct fn_zone *fz; 750 struct fn_zone *fz;
@@ -787,12 +788,7 @@ struct fib_table *fib_hash_table(u32 id)
787 788
788 tb->tb_id = id; 789 tb->tb_id = id;
789 tb->tb_default = -1; 790 tb->tb_default = -1;
790 tb->tb_lookup = fn_hash_lookup; 791
791 tb->tb_insert = fn_hash_insert;
792 tb->tb_delete = fn_hash_delete;
793 tb->tb_flush = fn_hash_flush;
794 tb->tb_select_default = fn_hash_select_default;
795 tb->tb_dump = fn_hash_dump;
796 memset(tb->tb_data, 0, sizeof(struct fn_hash)); 792 memset(tb->tb_data, 0, sizeof(struct fn_hash));
797 return tb; 793 return tb;
798} 794}
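
fib_hash_table() now only fills tb_id and tb_default; the backend keeps using the trailing tb_data area for its private struct fn_hash, as the casts above show. A small model of that trailing-storage pattern (simplified, illustrative types):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fib_table_model {
        unsigned int id;
        unsigned char tb_data[];        /* backend-private area */
};

struct fn_hash_model {
        int zones;
};

static struct fib_table_model *table_alloc(unsigned int id)
{
        struct fib_table_model *tb;

        tb = malloc(sizeof(*tb) + sizeof(struct fn_hash_model));
        if (!tb)
                return NULL;
        tb->id = id;
        memset(tb->tb_data, 0, sizeof(struct fn_hash_model));
        return tb;
}

int main(void)
{
        struct fib_table_model *tb = table_alloc(254);
        struct fn_hash_model *h;

        if (!tb)
                return 1;
        h = (struct fn_hash_model *)tb->tb_data;
        h->zones = 33;
        printf("table %u zones %d\n", tb->id, h->zones);
        free(tb);
        return 0;
}
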
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 92d9d97ec5e3..ca2d07b1c706 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -94,7 +94,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
95 goto errout; 95 goto errout;
96 96
97 err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); 97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
98 if (err > 0) 98 if (err > 0)
99 err = -EAGAIN; 99 err = -EAGAIN;
100errout: 100errout:
@@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
284{ 284{
285 int err; 285 int err;
286 286
287 err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); 287 err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
288 if (err < 0) 288 if (err < 0)
289 return err; 289 return err;
290 err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); 290 err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
@@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net)
301 int err; 301 int err;
302 struct fib_rules_ops *ops; 302 struct fib_rules_ops *ops;
303 303
304 ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); 304 ops = fib_rules_register(&fib4_rules_ops_template, net);
305 if (ops == NULL) 305 if (IS_ERR(ops))
306 return -ENOMEM; 306 return PTR_ERR(ops);
307 INIT_LIST_HEAD(&ops->rules_list);
308 ops->fro_net = net;
309
310 fib_rules_register(ops);
311 307
312 err = fib_default_rules_init(ops); 308 err = fib_default_rules_init(ops);
313 if (err < 0) 309 if (err < 0)
@@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net)
318fail: 314fail:
319 /* also cleans all rules already added */ 315 /* also cleans all rules already added */
320 fib_rules_unregister(ops); 316 fib_rules_unregister(ops);
321 kfree(ops);
322 return err; 317 return err;
323} 318}
324 319
325void __net_exit fib4_rules_exit(struct net *net) 320void __net_exit fib4_rules_exit(struct net *net)
326{ 321{
327 fib_rules_unregister(net->ipv4.rules_ops); 322 fib_rules_unregister(net->ipv4.rules_ops);
328 kfree(net->ipv4.rules_ops);
329} 323}
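
fib_rules_register() now duplicates the template itself and reports failure through the pointer it returns, which is why the caller switches from a NULL check plus kfree() to IS_ERR()/PTR_ERR(). A userspace re-implementation of that idiom for illustration (the real helpers live in linux/err.h):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
        return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
        /* error codes occupy the top 4095 values of the address space */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct fib_rules_ops_model {
        int family;
};

static struct fib_rules_ops_model *rules_register(int family)
{
        struct fib_rules_ops_model *ops = malloc(sizeof(*ops));

        if (!ops)
                return ERR_PTR(-ENOMEM);        /* error rides in the pointer */
        ops->family = family;
        return ops;
}

int main(void)
{
        struct fib_rules_ops_model *ops = rules_register(2);    /* AF_INET */

        if (IS_ERR(ops))
                return (int)-PTR_ERR(ops);
        printf("registered family %d\n", ops->family);
        free(ops);
        return 0;
}
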
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b096d6ff3f2..20f09c5b31e8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/slab.h>
35 36
36#include <net/arp.h> 37#include <net/arp.h>
37#include <net/ip.h> 38#include <net/ip.h>
@@ -62,8 +63,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
62#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
64 65
65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ 66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
66for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) 67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
67 68
68#else /* CONFIG_IP_ROUTE_MULTIPATH */ 69#else /* CONFIG_IP_ROUTE_MULTIPATH */
69 70
@@ -72,7 +73,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++,
72#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
73for (nhsel=0; nhsel < 1; nhsel++) 74for (nhsel=0; nhsel < 1; nhsel++)
74 75
75#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ 76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
76for (nhsel=0; nhsel < 1; nhsel++) 77for (nhsel=0; nhsel < 1; nhsel++)
77 78
78#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 79#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -145,9 +146,9 @@ void free_fib_info(struct fib_info *fi)
145 return; 146 return;
146 } 147 }
147 change_nexthops(fi) { 148 change_nexthops(fi) {
148 if (nh->nh_dev) 149 if (nexthop_nh->nh_dev)
149 dev_put(nh->nh_dev); 150 dev_put(nexthop_nh->nh_dev);
150 nh->nh_dev = NULL; 151 nexthop_nh->nh_dev = NULL;
151 } endfor_nexthops(fi); 152 } endfor_nexthops(fi);
152 fib_info_cnt--; 153 fib_info_cnt--;
153 release_net(fi->fib_net); 154 release_net(fi->fib_net);
@@ -162,9 +163,9 @@ void fib_release_info(struct fib_info *fi)
162 if (fi->fib_prefsrc) 163 if (fi->fib_prefsrc)
163 hlist_del(&fi->fib_lhash); 164 hlist_del(&fi->fib_lhash);
164 change_nexthops(fi) { 165 change_nexthops(fi) {
165 if (!nh->nh_dev) 166 if (!nexthop_nh->nh_dev)
166 continue; 167 continue;
167 hlist_del(&nh->nh_hash); 168 hlist_del(&nexthop_nh->nh_hash);
168 } endfor_nexthops(fi) 169 } endfor_nexthops(fi)
169 fi->fib_dead = 1; 170 fi->fib_dead = 1;
170 fib_info_put(fi); 171 fib_info_put(fi);
@@ -228,7 +229,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
228 head = &fib_info_hash[hash]; 229 head = &fib_info_hash[hash];
229 230
230 hlist_for_each_entry(fi, node, head, fib_hash) { 231 hlist_for_each_entry(fi, node, head, fib_hash) {
231 if (fi->fib_net != nfi->fib_net) 232 if (!net_eq(fi->fib_net, nfi->fib_net))
232 continue; 233 continue;
233 if (fi->fib_nhs != nfi->fib_nhs) 234 if (fi->fib_nhs != nfi->fib_nhs)
234 continue; 235 continue;
@@ -395,19 +396,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
395 if (!rtnh_ok(rtnh, remaining)) 396 if (!rtnh_ok(rtnh, remaining))
396 return -EINVAL; 397 return -EINVAL;
397 398
398 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; 399 nexthop_nh->nh_flags =
399 nh->nh_oif = rtnh->rtnh_ifindex; 400 (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400 nh->nh_weight = rtnh->rtnh_hops + 1; 401 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
402 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
401 403
402 attrlen = rtnh_attrlen(rtnh); 404 attrlen = rtnh_attrlen(rtnh);
403 if (attrlen > 0) { 405 if (attrlen > 0) {
404 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 406 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
405 407
406 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 408 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
407 nh->nh_gw = nla ? nla_get_be32(nla) : 0; 409 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
408#ifdef CONFIG_NET_CLS_ROUTE 410#ifdef CONFIG_NET_CLS_ROUTE
409 nla = nla_find(attrs, attrlen, RTA_FLOW); 411 nla = nla_find(attrs, attrlen, RTA_FLOW);
410 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 412 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
411#endif 413#endif
412 } 414 }
413 415
@@ -527,10 +529,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
527 if (nh->nh_gw) { 529 if (nh->nh_gw) {
528 struct fib_result res; 530 struct fib_result res;
529 531
530#ifdef CONFIG_IP_ROUTE_PERVASIVE
531 if (nh->nh_flags&RTNH_F_PERVASIVE)
532 return 0;
533#endif
534 if (nh->nh_flags&RTNH_F_ONLINK) { 532 if (nh->nh_flags&RTNH_F_ONLINK) {
535 struct net_device *dev; 533 struct net_device *dev;
536 534
@@ -738,7 +736,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
738 736
739 fi->fib_nhs = nhs; 737 fi->fib_nhs = nhs;
740 change_nexthops(fi) { 738 change_nexthops(fi) {
741 nh->nh_parent = fi; 739 nexthop_nh->nh_parent = fi;
742 } endfor_nexthops(fi) 740 } endfor_nexthops(fi)
743 741
744 if (cfg->fc_mx) { 742 if (cfg->fc_mx) {
@@ -808,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
808 goto failure; 806 goto failure;
809 } else { 807 } else {
810 change_nexthops(fi) { 808 change_nexthops(fi) {
811 if ((err = fib_check_nh(cfg, fi, nh)) != 0) 809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
812 goto failure; 810 goto failure;
813 } endfor_nexthops(fi) 811 } endfor_nexthops(fi)
814 } 812 }
@@ -843,11 +841,11 @@ link_it:
843 struct hlist_head *head; 841 struct hlist_head *head;
844 unsigned int hash; 842 unsigned int hash;
845 843
846 if (!nh->nh_dev) 844 if (!nexthop_nh->nh_dev)
847 continue; 845 continue;
848 hash = fib_devindex_hashfn(nh->nh_dev->ifindex); 846 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
849 head = &fib_info_devhash[hash]; 847 head = &fib_info_devhash[hash];
850 hlist_add_head(&nh->nh_hash, head); 848 hlist_add_head(&nexthop_nh->nh_hash, head);
851 } endfor_nexthops(fi) 849 } endfor_nexthops(fi)
852 spin_unlock_bh(&fib_info_lock); 850 spin_unlock_bh(&fib_info_lock);
853 return fi; 851 return fi;
@@ -1047,7 +1045,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
1047 return 0; 1045 return 0;
1048 1046
1049 hlist_for_each_entry(fi, node, head, fib_lhash) { 1047 hlist_for_each_entry(fi, node, head, fib_lhash) {
1050 if (fi->fib_net != net) 1048 if (!net_eq(fi->fib_net, net))
1051 continue; 1049 continue;
1052 if (fi->fib_prefsrc == local) { 1050 if (fi->fib_prefsrc == local) {
1053 fi->fib_flags |= RTNH_F_DEAD; 1051 fi->fib_flags |= RTNH_F_DEAD;
@@ -1080,21 +1078,21 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1080 prev_fi = fi; 1078 prev_fi = fi;
1081 dead = 0; 1079 dead = 0;
1082 change_nexthops(fi) { 1080 change_nexthops(fi) {
1083 if (nh->nh_flags&RTNH_F_DEAD) 1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD)
1084 dead++; 1082 dead++;
1085 else if (nh->nh_dev == dev && 1083 else if (nexthop_nh->nh_dev == dev &&
1086 nh->nh_scope != scope) { 1084 nexthop_nh->nh_scope != scope) {
1087 nh->nh_flags |= RTNH_F_DEAD; 1085 nexthop_nh->nh_flags |= RTNH_F_DEAD;
1088#ifdef CONFIG_IP_ROUTE_MULTIPATH 1086#ifdef CONFIG_IP_ROUTE_MULTIPATH
1089 spin_lock_bh(&fib_multipath_lock); 1087 spin_lock_bh(&fib_multipath_lock);
1090 fi->fib_power -= nh->nh_power; 1088 fi->fib_power -= nexthop_nh->nh_power;
1091 nh->nh_power = 0; 1089 nexthop_nh->nh_power = 0;
1092 spin_unlock_bh(&fib_multipath_lock); 1090 spin_unlock_bh(&fib_multipath_lock);
1093#endif 1091#endif
1094 dead++; 1092 dead++;
1095 } 1093 }
1096#ifdef CONFIG_IP_ROUTE_MULTIPATH 1094#ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 if (force > 1 && nh->nh_dev == dev) { 1095 if (force > 1 && nexthop_nh->nh_dev == dev) {
1098 dead = fi->fib_nhs; 1096 dead = fi->fib_nhs;
1099 break; 1097 break;
1100 } 1098 }
@@ -1144,18 +1142,20 @@ int fib_sync_up(struct net_device *dev)
1144 prev_fi = fi; 1142 prev_fi = fi;
1145 alive = 0; 1143 alive = 0;
1146 change_nexthops(fi) { 1144 change_nexthops(fi) {
1147 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1148 alive++; 1146 alive++;
1149 continue; 1147 continue;
1150 } 1148 }
1151 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) 1149 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP))
1152 continue; 1151 continue;
1153 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) 1152 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev))
1154 continue; 1154 continue;
1155 alive++; 1155 alive++;
1156 spin_lock_bh(&fib_multipath_lock); 1156 spin_lock_bh(&fib_multipath_lock);
1157 nh->nh_power = 0; 1157 nexthop_nh->nh_power = 0;
1158 nh->nh_flags &= ~RTNH_F_DEAD; 1158 nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1159 spin_unlock_bh(&fib_multipath_lock); 1159 spin_unlock_bh(&fib_multipath_lock);
1160 } endfor_nexthops(fi) 1160 } endfor_nexthops(fi)
1161 1161
@@ -1182,9 +1182,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1182 if (fi->fib_power <= 0) {
1183 int power = 0; 1183 int power = 0;
1184 change_nexthops(fi) { 1184 change_nexthops(fi) {
1185 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1186 power += nh->nh_weight; 1186 power += nexthop_nh->nh_weight;
1187 nh->nh_power = nh->nh_weight; 1187 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1188 }
1189 } endfor_nexthops(fi); 1189 } endfor_nexthops(fi);
1190 fi->fib_power = power; 1190 fi->fib_power = power;
@@ -1204,9 +1204,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1204 w = jiffies % fi->fib_power; 1204 w = jiffies % fi->fib_power;
1205 1205
1206 change_nexthops(fi) { 1206 change_nexthops(fi) {
1207 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { 1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
1208 if ((w -= nh->nh_power) <= 0) { 1208 nexthop_nh->nh_power) {
1209 nh->nh_power--; 1209 if ((w -= nexthop_nh->nh_power) <= 0) {
1210 nexthop_nh->nh_power--;
1210 fi->fib_power--; 1211 fi->fib_power--;
1211 res->nh_sel = nhsel; 1212 res->nh_sel = nhsel;
1212 spin_unlock_bh(&fib_multipath_lock); 1213 spin_unlock_bh(&fib_multipath_lock);
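
The bulk of this diff renames the iterator variable hidden inside change_nexthops() from nh to nexthop_nh, because a macro that silently declares nh can shadow, or collide with, an nh the surrounding function already uses. A miniature of the macro pair showing the hazard (much simplified; the real macros also carry the nhsel index):

#include <stdio.h>

struct nh {
        int weight;
};

struct fi {
        struct nh nhs[2];
        int n;
};

/* old style: the macro hides a variable literally named 'nh' */
#define change_nexthops_old(fi) \
        for (int i = 0; i < (fi)->n; i++) { struct nh *nh = &(fi)->nhs[i];

/* new style: an unambiguous hidden name */
#define change_nexthops(fi) \
        for (int i = 0; i < (fi)->n; i++) { \
                struct nh *nexthop_nh = &(fi)->nhs[i];

#define endfor_nexthops }

int main(void)
{
        struct fi f = { .nhs = { { 1 }, { 2 } }, .n = 2 };
        struct nh *nh = &f.nhs[0];      /* the caller's own 'nh'... */

        change_nexthops(&f)
                nexthop_nh->weight++;   /* ...is no longer shadowed here */
        endfor_nexthops

        printf("caller's nh now weighs %d\n", nh->weight);
        return 0;
}
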
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 291bdf50a21f..c98f115fb0fd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,6 +71,7 @@
71#include <linux/netlink.h> 71#include <linux/netlink.h>
72#include <linux/init.h> 72#include <linux/init.h>
73#include <linux/list.h> 73#include <linux/list.h>
74#include <linux/slab.h>
74#include <net/net_namespace.h> 75#include <net/net_namespace.h>
75#include <net/ip.h> 76#include <net/ip.h>
76#include <net/protocol.h> 77#include <net/protocol.h>
@@ -208,7 +209,9 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
208{ 209{
209 struct node *ret = tnode_get_child(tn, i); 210 struct node *ret = tnode_get_child(tn, i);
210 211
211 return rcu_dereference(ret); 212 return rcu_dereference_check(ret,
213 rcu_read_lock_held() ||
214 lockdep_rtnl_is_held());
212} 215}
213 216
214static inline int tnode_child_length(const struct tnode *tn) 217static inline int tnode_child_length(const struct tnode *tn)
@@ -961,7 +964,9 @@ fib_find_node(struct trie *t, u32 key)
961 struct node *n; 964 struct node *n;
962 965
963 pos = 0; 966 pos = 0;
964 n = rcu_dereference(t->trie); 967 n = rcu_dereference_check(t->trie,
968 rcu_read_lock_held() ||
969 lockdep_rtnl_is_held());
965 970
966 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 971 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
967 tn = (struct tnode *) n; 972 tn = (struct tnode *) n;
@@ -1174,7 +1179,7 @@ done:
1174/* 1179/*
1175 * Caller must hold RTNL. 1180 * Caller must hold RTNL.
1176 */ 1181 */
1177static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) 1182int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1178{ 1183{
1179 struct trie *t = (struct trie *) tb->tb_data; 1184 struct trie *t = (struct trie *) tb->tb_data;
1180 struct fib_alias *fa, *new_fa; 1185 struct fib_alias *fa, *new_fa;
@@ -1373,8 +1378,8 @@ static int check_leaf(struct trie *t, struct leaf *l,
1373 return 1; 1378 return 1;
1374} 1379}
1375 1380
1376static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, 1381int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1377 struct fib_result *res) 1382 struct fib_result *res)
1378{ 1383{
1379 struct trie *t = (struct trie *) tb->tb_data; 1384 struct trie *t = (struct trie *) tb->tb_data;
1380 int ret; 1385 int ret;
@@ -1595,7 +1600,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1595/* 1600/*
1596 * Caller must hold RTNL. 1601 * Caller must hold RTNL.
1597 */ 1602 */
1598static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) 1603int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1599{ 1604{
1600 struct trie *t = (struct trie *) tb->tb_data; 1605 struct trie *t = (struct trie *) tb->tb_data;
1601 u32 key, mask; 1606 u32 key, mask;
@@ -1786,7 +1791,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
1786/* 1791/*
1787 * Caller must hold RTNL. 1792 * Caller must hold RTNL.
1788 */ 1793 */
1789static int fn_trie_flush(struct fib_table *tb) 1794int fib_table_flush(struct fib_table *tb)
1790{ 1795{
1791 struct trie *t = (struct trie *) tb->tb_data; 1796 struct trie *t = (struct trie *) tb->tb_data;
1792 struct leaf *l, *ll = NULL; 1797 struct leaf *l, *ll = NULL;
@@ -1807,9 +1812,9 @@ static int fn_trie_flush(struct fib_table *tb)
1807 return found; 1812 return found;
1808} 1813}
1809 1814
1810static void fn_trie_select_default(struct fib_table *tb, 1815void fib_table_select_default(struct fib_table *tb,
1811 const struct flowi *flp, 1816 const struct flowi *flp,
1812 struct fib_result *res) 1817 struct fib_result *res)
1813{ 1818{
1814 struct trie *t = (struct trie *) tb->tb_data; 1819 struct trie *t = (struct trie *) tb->tb_data;
1815 int order, last_idx; 1820 int order, last_idx;
@@ -1952,8 +1957,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1952 return skb->len; 1957 return skb->len;
1953} 1958}
1954 1959
1955static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, 1960int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1956 struct netlink_callback *cb) 1961 struct netlink_callback *cb)
1957{ 1962{
1958 struct leaf *l; 1963 struct leaf *l;
1959 struct trie *t = (struct trie *) tb->tb_data; 1964 struct trie *t = (struct trie *) tb->tb_data;
@@ -2020,12 +2025,6 @@ struct fib_table *fib_hash_table(u32 id)
2020 2025
2021 tb->tb_id = id; 2026 tb->tb_id = id;
2022 tb->tb_default = -1; 2027 tb->tb_default = -1;
2023 tb->tb_lookup = fn_trie_lookup;
2024 tb->tb_insert = fn_trie_insert;
2025 tb->tb_delete = fn_trie_delete;
2026 tb->tb_flush = fn_trie_flush;
2027 tb->tb_select_default = fn_trie_select_default;
2028 tb->tb_dump = fn_trie_dump;
2029 2028
2030 t = (struct trie *) tb->tb_data; 2029 t = (struct trie *) tb->tb_data;
2031 memset(t, 0, sizeof(*t)); 2030 memset(t, 0, sizeof(*t));
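
The rcu_dereference_check() conversions let the trie walkers run either under rcu_read_lock() or with RTNL held on the update side, and complain (via lockdep) only when neither holds. A userspace sketch of the checked-dereference idea, with assert() standing in for the lockdep condition (illustrative flags, not the kernel primitives):

#include <assert.h>
#include <stdio.h>

static int rcu_readers;         /* models rcu_read_lock_held() */
static int rtnl_held;           /* models lockdep_rtnl_is_held() */

#define rcu_dereference_check(p, cond) (assert(cond), (p))

struct trie_node {
        int key;
};

static struct trie_node *trie_root;

static struct trie_node *find_node(void)
{
        /* legal from a reader (RCU) or a writer (RTNL); the assert
         * fires only when the caller holds neither, as lockdep would */
        return rcu_dereference_check(trie_root,
                                     rcu_readers > 0 || rtnl_held);
}

int main(void)
{
        static struct trie_node n = { .key = 7 };

        trie_root = &n;

        rtnl_held = 1;                  /* update side */
        printf("key=%d\n", find_node()->key);
        rtnl_held = 0;

        rcu_readers++;                  /* read side */
        printf("key=%d\n", find_node()->key);
        rcu_readers--;
        return 0;
}
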
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5bc13fe816d1..ac4dec132735 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -74,6 +74,7 @@
74#include <linux/netdevice.h> 74#include <linux/netdevice.h>
75#include <linux/string.h> 75#include <linux/string.h>
76#include <linux/netfilter_ipv4.h> 76#include <linux/netfilter_ipv4.h>
77#include <linux/slab.h>
77#include <net/snmp.h> 78#include <net/snmp.h>
78#include <net/ip.h> 79#include <net/ip.h>
79#include <net/route.h> 80#include <net/route.h>
@@ -114,7 +115,7 @@ struct icmp_bxm {
114/* An array of errno for error messages from dest unreach. */ 115/* An array of errno for error messages from dest unreach. */
115/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ 116/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
116 117
117struct icmp_err icmp_err_convert[] = { 118const struct icmp_err icmp_err_convert[] = {
118 { 119 {
119 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ 120 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
120 .fatal = 0, 121 .fatal = 0,
@@ -501,15 +502,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
501 if (!(rt->rt_flags & RTCF_LOCAL)) { 502 if (!(rt->rt_flags & RTCF_LOCAL)) {
502 struct net_device *dev = NULL; 503 struct net_device *dev = NULL;
503 504
505 rcu_read_lock();
504 if (rt->fl.iif && 506 if (rt->fl.iif &&
505 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 507 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
506 dev = dev_get_by_index(net, rt->fl.iif); 508 dev = dev_get_by_index_rcu(net, rt->fl.iif);
507 509
508 if (dev) { 510 if (dev)
509 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 511 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
510 dev_put(dev); 512 else
511 } else
512 saddr = 0; 513 saddr = 0;
514 rcu_read_unlock();
513 } 515 }
514 516
515 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | 517 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
@@ -1165,6 +1167,10 @@ static int __net_init icmp_sk_init(struct net *net)
1165 sk->sk_sndbuf = 1167 sk->sk_sndbuf =
1166 (2 * ((64 * 1024) + sizeof(struct sk_buff))); 1168 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1167 1169
1170 /*
1171 * Speed up sock_wfree()
1172 */
1173 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1168 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; 1174 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
1169 } 1175 }
1170 1176
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d41e5de79a82..15d3eeda92f5 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -71,6 +71,7 @@
71 */ 71 */
72 72
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h>
74#include <asm/uaccess.h> 75#include <asm/uaccess.h>
75#include <asm/system.h> 76#include <asm/system.h>
76#include <linux/types.h> 77#include <linux/types.h>
@@ -946,7 +947,6 @@ int igmp_rcv(struct sk_buff *skb)
946 break; 947 break;
947 case IGMP_HOST_MEMBERSHIP_REPORT: 948 case IGMP_HOST_MEMBERSHIP_REPORT:
948 case IGMPV2_HOST_MEMBERSHIP_REPORT: 949 case IGMPV2_HOST_MEMBERSHIP_REPORT:
949 case IGMPV3_HOST_MEMBERSHIP_REPORT:
950 /* Is it our report looped back? */ 950 /* Is it our report looped back? */
951 if (skb_rtable(skb)->fl.iif == 0) 951 if (skb_rtable(skb)->fl.iif == 0)
952 break; 952 break;
@@ -960,6 +960,7 @@ int igmp_rcv(struct sk_buff *skb)
960 in_dev_put(in_dev); 960 in_dev_put(in_dev);
961 return pim_rcv_v1(skb); 961 return pim_rcv_v1(skb);
962#endif 962#endif
963 case IGMPV3_HOST_MEMBERSHIP_REPORT:
963 case IGMP_DVMRP: 964 case IGMP_DVMRP:
964 case IGMP_TRACE: 965 case IGMP_TRACE:
965 case IGMP_HOST_LEAVE_MESSAGE: 966 case IGMP_HOST_LEAVE_MESSAGE:
@@ -1799,7 +1800,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1799 iml->next = inet->mc_list; 1800 iml->next = inet->mc_list;
1800 iml->sflist = NULL; 1801 iml->sflist = NULL;
1801 iml->sfmode = MCAST_EXCLUDE; 1802 iml->sfmode = MCAST_EXCLUDE;
1802 inet->mc_list = iml; 1803 rcu_assign_pointer(inet->mc_list, iml);
1803 ip_mc_inc_group(in_dev, addr); 1804 ip_mc_inc_group(in_dev, addr);
1804 err = 0; 1805 err = 0;
1805done: 1806done:
@@ -1807,24 +1808,46 @@ done:
1807 return err; 1808 return err;
1808} 1809}
1809 1810
1811static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1812{
1813 struct ip_sf_socklist *psf;
1814
1815 psf = container_of(rp, struct ip_sf_socklist, rcu);
1816 /* sk_omem_alloc should have been decreased by the caller */
1817 kfree(psf);
1818}
1819
1810static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1820static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1811 struct in_device *in_dev) 1821 struct in_device *in_dev)
1812{ 1822{
1823 struct ip_sf_socklist *psf = iml->sflist;
1813 int err; 1824 int err;
1814 1825
1815 if (iml->sflist == NULL) { 1826 if (psf == NULL) {
1816 /* any-source empty exclude case */ 1827 /* any-source empty exclude case */
1817 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1828 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1818 iml->sfmode, 0, NULL, 0); 1829 iml->sfmode, 0, NULL, 0);
1819 } 1830 }
1820 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1831 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1821 iml->sfmode, iml->sflist->sl_count, 1832 iml->sfmode, psf->sl_count, psf->sl_addr, 0);
1822 iml->sflist->sl_addr, 0); 1833 rcu_assign_pointer(iml->sflist, NULL);
1823 sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); 1834 /* decrease mem now to avoid the memleak warning */
1824 iml->sflist = NULL; 1835 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1836 call_rcu(&psf->rcu, ip_sf_socklist_reclaim);
1825 return err; 1837 return err;
1826} 1838}
1827 1839
1840
1841static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1842{
1843 struct ip_mc_socklist *iml;
1844
1845 iml = container_of(rp, struct ip_mc_socklist, rcu);
1846 /* sk_omem_alloc should have been decreased by the caller */
1847 kfree(iml);
1848}
1849
1850
1828/* 1851/*
1829 * Ask a socket to leave a group. 1852 * Ask a socket to leave a group.
1830 */ 1853 */
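
The new reclaim helpers above implement an RCU-deferred kfree(): the socket's memory accounting is decreased immediately (hence the comments), but the object itself is only freed once a grace period guarantees no reader still holds a pointer obtained under rcu_read_lock(). A single-threaded userspace model of call_rcu() semantics, where draining a pending list stands in for the grace period (illustrative only):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head_model {
        struct rcu_head_model *next;
        void (*func)(struct rcu_head_model *head);
};

struct sf_socklist_model {
        int count;
        struct rcu_head_model rcu;
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static struct rcu_head_model *pending;

static void call_rcu_model(struct rcu_head_model *head,
                           void (*func)(struct rcu_head_model *))
{
        head->func = func;
        head->next = pending;
        pending = head;
}

static void grace_period_elapsed(void)
{
        while (pending) {
                struct rcu_head_model *head = pending;

                pending = head->next;
                head->func(head);
        }
}

static void sf_reclaim(struct rcu_head_model *head)
{
        struct sf_socklist_model *psl =
                container_of(head, struct sf_socklist_model, rcu);

        printf("freeing list of %d sources\n", psl->count);
        free(psl);
}

int main(void)
{
        struct sf_socklist_model *psl = malloc(sizeof(*psl));

        if (!psl)
                return 1;
        psl->count = 3;
        /* accounting would be adjusted here, before the deferred free */
        call_rcu_model(&psl->rcu, sf_reclaim);
        grace_period_elapsed();         /* no readers left: safe to free */
        return 0;
}
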
@@ -1854,12 +1877,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1854 1877
1855 (void) ip_mc_leave_src(sk, iml, in_dev); 1878 (void) ip_mc_leave_src(sk, iml, in_dev);
1856 1879
1857 *imlp = iml->next; 1880 rcu_assign_pointer(*imlp, iml->next);
1858 1881
1859 if (in_dev) 1882 if (in_dev)
1860 ip_mc_dec_group(in_dev, group); 1883 ip_mc_dec_group(in_dev, group);
1861 rtnl_unlock(); 1884 rtnl_unlock();
1862 sock_kfree_s(sk, iml, sizeof(*iml)); 1885 /* decrease mem now to avoid the memleak warning */
1886 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1887 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
1863 return 0; 1888 return 0;
1864 } 1889 }
1865 if (!in_dev) 1890 if (!in_dev)
@@ -1899,8 +1924,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1899 err = -EADDRNOTAVAIL; 1924 err = -EADDRNOTAVAIL;
1900 1925
1901 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1926 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1902 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr 1927 if ((pmc->multi.imr_multiaddr.s_addr ==
1903 && pmc->multi.imr_ifindex == imr.imr_ifindex) 1928 imr.imr_multiaddr.s_addr) &&
1929 (pmc->multi.imr_ifindex == imr.imr_ifindex))
1904 break; 1930 break;
1905 } 1931 }
1906 if (!pmc) { /* must have a prior join */ 1932 if (!pmc) { /* must have a prior join */
@@ -1973,9 +1999,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1973 if (psl) { 1999 if (psl) {
1974 for (i=0; i<psl->sl_count; i++) 2000 for (i=0; i<psl->sl_count; i++)
1975 newpsl->sl_addr[i] = psl->sl_addr[i]; 2001 newpsl->sl_addr[i] = psl->sl_addr[i];
1976 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2002 /* decrease mem now to avoid the memleak warning */
2003 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2004 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
1977 } 2005 }
1978 pmc->sflist = psl = newpsl; 2006 rcu_assign_pointer(pmc->sflist, newpsl);
2007 psl = newpsl;
1979 } 2008 }
1980 rv = 1; /* > 0 for insert logic below if sl_count is 0 */ 2009 rv = 1; /* > 0 for insert logic below if sl_count is 0 */
1981 for (i=0; i<psl->sl_count; i++) { 2010 for (i=0; i<psl->sl_count; i++) {
@@ -2071,11 +2100,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2071 if (psl) { 2100 if (psl) {
2072 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2101 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2073 psl->sl_count, psl->sl_addr, 0); 2102 psl->sl_count, psl->sl_addr, 0);
2074 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2103 /* decrease mem now to avoid the memleak warning */
2104 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2105 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
2075 } else 2106 } else
2076 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2107 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2077 0, NULL, 0); 2108 0, NULL, 0);
2078 pmc->sflist = newpsl; 2109 rcu_assign_pointer(pmc->sflist, newpsl);
2079 pmc->sfmode = msf->imsf_fmode; 2110 pmc->sfmode = msf->imsf_fmode;
2080 err = 0; 2111 err = 0;
2081done: 2112done:
@@ -2208,30 +2239,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2208 struct ip_mc_socklist *pmc; 2239 struct ip_mc_socklist *pmc;
2209 struct ip_sf_socklist *psl; 2240 struct ip_sf_socklist *psl;
2210 int i; 2241 int i;
2242 int ret;
2211 2243
2244 ret = 1;
2212 if (!ipv4_is_multicast(loc_addr)) 2245 if (!ipv4_is_multicast(loc_addr))
2213 return 1; 2246 goto out;
2214 2247
2215 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2248 rcu_read_lock();
2249 for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) {
2216 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2250 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2217 pmc->multi.imr_ifindex == dif) 2251 pmc->multi.imr_ifindex == dif)
2218 break; 2252 break;
2219 } 2253 }
2254 ret = inet->mc_all;
2220 if (!pmc) 2255 if (!pmc)
2221 return inet->mc_all; 2256 goto unlock;
2222 psl = pmc->sflist; 2257 psl = pmc->sflist;
2258 ret = (pmc->sfmode == MCAST_EXCLUDE);
2223 if (!psl) 2259 if (!psl)
2224 return pmc->sfmode == MCAST_EXCLUDE; 2260 goto unlock;
2225 2261
2226 for (i=0; i<psl->sl_count; i++) { 2262 for (i=0; i<psl->sl_count; i++) {
2227 if (psl->sl_addr[i] == rmt_addr) 2263 if (psl->sl_addr[i] == rmt_addr)
2228 break; 2264 break;
2229 } 2265 }
2266 ret = 0;
2230 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) 2267 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
2231 return 0; 2268 goto unlock;
2232 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) 2269 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
2233 return 0; 2270 goto unlock;
2234 return 1; 2271 ret = 1;
2272unlock:
2273 rcu_read_unlock();
2274out:
2275 return ret;
2235} 2276}
2236 2277
2237/* 2278/*
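Rewriting ip_mc_sf_allow() with a single unlock/out exit pair ensures the newly added rcu_read_lock() is released on every path, including the early-match and no-filter cases that previously returned directly. The control-flow shape, sketched in self-contained C with a rwlock standing in for the RCU read side (all names illustrative):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct node {
    	int key;
    	bool flag;
    	struct node *next;
    };

    static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct node *list_head;

    static bool lookup_flag(int key, bool dflt)
    {
    	struct node *n;
    	bool ret = dflt;

    	if (key < 0)		/* cheap rejection before taking the lock */
    		goto out;

    	pthread_rwlock_rdlock(&list_lock);  /* stand-in for rcu_read_lock() */
    	for (n = list_head; n; n = n->next)
    		if (n->key == key)
    			break;
    	if (!n)
    		goto unlock;	/* every failure path funnels through unlock */
    	ret = n->flag;
    unlock:
    	pthread_rwlock_unlock(&list_lock);
    out:
    	return ret;
    }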
@@ -2250,7 +2291,7 @@ void ip_mc_drop_socket(struct sock *sk)
2250 rtnl_lock(); 2291 rtnl_lock();
2251 while ((iml = inet->mc_list) != NULL) { 2292 while ((iml = inet->mc_list) != NULL) {
2252 struct in_device *in_dev; 2293 struct in_device *in_dev;
2253 inet->mc_list = iml->next; 2294 rcu_assign_pointer(inet->mc_list, iml->next);
2254 2295
2255 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2296 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2256 (void) ip_mc_leave_src(sk, iml, in_dev); 2297 (void) ip_mc_leave_src(sk, iml, in_dev);
@@ -2258,7 +2299,9 @@ void ip_mc_drop_socket(struct sock *sk)
2258 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2299 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2259 in_dev_put(in_dev); 2300 in_dev_put(in_dev);
2260 } 2301 }
2261 sock_kfree_s(sk, iml, sizeof(*iml)); 2302 /* decrease mem now to avoid the memleak warning */
2303 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2304 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
2262 } 2305 }
2263 rtnl_unlock(); 2306 rtnl_unlock();
2264} 2307}
@@ -2311,9 +2354,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2311 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2354 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2312 2355
2313 state->in_dev = NULL; 2356 state->in_dev = NULL;
2314 for_each_netdev(net, state->dev) { 2357 for_each_netdev_rcu(net, state->dev) {
2315 struct in_device *in_dev; 2358 struct in_device *in_dev;
2316 in_dev = in_dev_get(state->dev); 2359
2360 in_dev = __in_dev_get_rcu(state->dev);
2317 if (!in_dev) 2361 if (!in_dev)
2318 continue; 2362 continue;
2319 read_lock(&in_dev->mc_list_lock); 2363 read_lock(&in_dev->mc_list_lock);
@@ -2323,7 +2367,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2323 break; 2367 break;
2324 } 2368 }
2325 read_unlock(&in_dev->mc_list_lock); 2369 read_unlock(&in_dev->mc_list_lock);
2326 in_dev_put(in_dev);
2327 } 2370 }
2328 return im; 2371 return im;
2329} 2372}
@@ -2333,16 +2376,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
2333 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2376 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2334 im = im->next; 2377 im = im->next;
2335 while (!im) { 2378 while (!im) {
2336 if (likely(state->in_dev != NULL)) { 2379 if (likely(state->in_dev != NULL))
2337 read_unlock(&state->in_dev->mc_list_lock); 2380 read_unlock(&state->in_dev->mc_list_lock);
2338 in_dev_put(state->in_dev); 2381
2339 } 2382 state->dev = next_net_device_rcu(state->dev);
2340 state->dev = next_net_device(state->dev);
2341 if (!state->dev) { 2383 if (!state->dev) {
2342 state->in_dev = NULL; 2384 state->in_dev = NULL;
2343 break; 2385 break;
2344 } 2386 }
2345 state->in_dev = in_dev_get(state->dev); 2387 state->in_dev = __in_dev_get_rcu(state->dev);
2346 if (!state->in_dev) 2388 if (!state->in_dev)
2347 continue; 2389 continue;
2348 read_lock(&state->in_dev->mc_list_lock); 2390 read_lock(&state->in_dev->mc_list_lock);
@@ -2361,9 +2403,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
2361} 2403}
2362 2404
2363static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) 2405static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
2364 __acquires(dev_base_lock) 2406 __acquires(rcu)
2365{ 2407{
2366 read_lock(&dev_base_lock); 2408 rcu_read_lock();
2367 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2409 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2368} 2410}
2369 2411
@@ -2379,16 +2421,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2379} 2421}
2380 2422
2381static void igmp_mc_seq_stop(struct seq_file *seq, void *v) 2423static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2382 __releases(dev_base_lock) 2424 __releases(rcu)
2383{ 2425{
2384 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2426 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2385 if (likely(state->in_dev != NULL)) { 2427 if (likely(state->in_dev != NULL)) {
2386 read_unlock(&state->in_dev->mc_list_lock); 2428 read_unlock(&state->in_dev->mc_list_lock);
2387 in_dev_put(state->in_dev);
2388 state->in_dev = NULL; 2429 state->in_dev = NULL;
2389 } 2430 }
2390 state->dev = NULL; 2431 state->dev = NULL;
2391 read_unlock(&dev_base_lock); 2432 rcu_read_unlock();
2392} 2433}
2393 2434
2394static int igmp_mc_seq_show(struct seq_file *seq, void *v) 2435static int igmp_mc_seq_show(struct seq_file *seq, void *v)
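The __acquires(rcu)/__releases(rcu) markers on igmp_mc_seq_start/stop are sparse annotations: they declare, for static analysis only, that the start handler returns with the read side held and the stop handler releases it. A compilable sketch of the convention, assuming empty macro expansions and a rwlock as the stand-in lock:

    #include <pthread.h>

    /* sparse-style annotations: they document which lock a function
     * enters/leaves holding, and expand to nothing for the compiler */
    #define __acquires(x)
    #define __releases(x)

    static pthread_rwlock_t base_lock = PTHREAD_RWLOCK_INITIALIZER;

    static void *iter_start(void) __acquires(base_lock)
    {
    	pthread_rwlock_rdlock(&base_lock); /* was read_lock(&dev_base_lock) */
    	return NULL;			   /* ...locate first element... */
    }

    static void iter_stop(void *v) __releases(base_lock)
    {
    	(void)v;
    	pthread_rwlock_unlock(&base_lock); /* rcu_read_unlock() in the patch */
    }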
@@ -2462,9 +2503,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2462 2503
2463 state->idev = NULL; 2504 state->idev = NULL;
2464 state->im = NULL; 2505 state->im = NULL;
2465 for_each_netdev(net, state->dev) { 2506 for_each_netdev_rcu(net, state->dev) {
2466 struct in_device *idev; 2507 struct in_device *idev;
2467 idev = in_dev_get(state->dev); 2508 idev = __in_dev_get_rcu(state->dev);
2468 if (unlikely(idev == NULL)) 2509 if (unlikely(idev == NULL))
2469 continue; 2510 continue;
2470 read_lock(&idev->mc_list_lock); 2511 read_lock(&idev->mc_list_lock);
@@ -2480,7 +2521,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2480 spin_unlock_bh(&im->lock); 2521 spin_unlock_bh(&im->lock);
2481 } 2522 }
2482 read_unlock(&idev->mc_list_lock); 2523 read_unlock(&idev->mc_list_lock);
2483 in_dev_put(idev);
2484 } 2524 }
2485 return psf; 2525 return psf;
2486} 2526}
@@ -2494,16 +2534,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2494 spin_unlock_bh(&state->im->lock); 2534 spin_unlock_bh(&state->im->lock);
2495 state->im = state->im->next; 2535 state->im = state->im->next;
2496 while (!state->im) { 2536 while (!state->im) {
2497 if (likely(state->idev != NULL)) { 2537 if (likely(state->idev != NULL))
2498 read_unlock(&state->idev->mc_list_lock); 2538 read_unlock(&state->idev->mc_list_lock);
2499 in_dev_put(state->idev); 2539
2500 } 2540 state->dev = next_net_device_rcu(state->dev);
2501 state->dev = next_net_device(state->dev);
2502 if (!state->dev) { 2541 if (!state->dev) {
2503 state->idev = NULL; 2542 state->idev = NULL;
2504 goto out; 2543 goto out;
2505 } 2544 }
2506 state->idev = in_dev_get(state->dev); 2545 state->idev = __in_dev_get_rcu(state->dev);
2507 if (!state->idev) 2546 if (!state->idev)
2508 continue; 2547 continue;
2509 read_lock(&state->idev->mc_list_lock); 2548 read_lock(&state->idev->mc_list_lock);
@@ -2528,8 +2567,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
2528} 2567}
2529 2568
2530static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) 2569static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
2570 __acquires(rcu)
2531{ 2571{
2532 read_lock(&dev_base_lock); 2572 rcu_read_lock();
2533 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2573 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2534} 2574}
2535 2575
@@ -2545,6 +2585,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2545} 2585}
2546 2586
2547static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) 2587static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2588 __releases(rcu)
2548{ 2589{
2549 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); 2590 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2550 if (likely(state->im != NULL)) { 2591 if (likely(state->im != NULL)) {
@@ -2553,11 +2594,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2553 } 2594 }
2554 if (likely(state->idev != NULL)) { 2595 if (likely(state->idev != NULL)) {
2555 read_unlock(&state->idev->mc_list_lock); 2596 read_unlock(&state->idev->mc_list_lock);
2556 in_dev_put(state->idev);
2557 state->idev = NULL; 2597 state->idev = NULL;
2558 } 2598 }
2559 state->dev = NULL; 2599 state->dev = NULL;
2560 read_unlock(&dev_base_lock); 2600 rcu_read_unlock();
2561} 2601}
2562 2602
2563static int igmp_mcf_seq_show(struct seq_file *seq, void *v) 2603static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
@@ -2605,7 +2645,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
2605 .release = seq_release_net, 2645 .release = seq_release_net,
2606}; 2646};
2607 2647
2608static int igmp_net_init(struct net *net) 2648static int __net_init igmp_net_init(struct net *net)
2609{ 2649{
2610 struct proc_dir_entry *pde; 2650 struct proc_dir_entry *pde;
2611 2651
@@ -2623,7 +2663,7 @@ out_igmp:
2623 return -ENOMEM; 2663 return -ENOMEM;
2624} 2664}
2625 2665
2626static void igmp_net_exit(struct net *net) 2666static void __net_exit igmp_net_exit(struct net *net)
2627{ 2667{
2628 proc_net_remove(net, "mcfilter"); 2668 proc_net_remove(net, "mcfilter");
2629 proc_net_remove(net, "igmp"); 2669 proc_net_remove(net, "igmp");
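The __net_init/__net_exit tags on the per-namespace constructor and destructor let the linker place this code in discardable sections when network namespaces are compiled out, since it then runs at most once at boot. An illustrative reconstruction of the idea, not the verbatim kernel macros:

    /* illustrative: with namespaces enabled, per-net init/exit must stay
     * resident (namespaces come and go at runtime); with them disabled,
     * the code runs once at boot and can be dropped afterwards */
    #ifdef CONFIG_NET_NS
    # define __net_init
    # define __net_exit
    #else
    # define __net_init	__attribute__((section(".init.text")))
    # define __net_exit	__attribute__((section(".exit.text")))
    #endif

    static int  __net_init demo_net_init(void) { return 0; }
    static void __net_exit demo_net_exit(void) { }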
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 537731b3bcb3..8da6429269dd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -112,7 +112,7 @@ again:
112 hashinfo->bhash_size)]; 112 hashinfo->bhash_size)];
113 spin_lock(&head->lock); 113 spin_lock(&head->lock);
114 inet_bind_bucket_for_each(tb, node, &head->chain) 114 inet_bind_bucket_for_each(tb, node, &head->chain)
115 if (ib_net(tb) == net && tb->port == rover) { 115 if (net_eq(ib_net(tb), net) && tb->port == rover) {
116 if (tb->fastreuse > 0 && 116 if (tb->fastreuse > 0 &&
117 sk->sk_reuse && 117 sk->sk_reuse &&
118 sk->sk_state != TCP_LISTEN && 118 sk->sk_state != TCP_LISTEN &&
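Comparing namespaces through net_eq() instead of raw pointer equality is more than style: when CONFIG_NET_NS is off, the helper collapses to a constant and the whole test disappears. Roughly, as a sketch from memory rather than the verbatim kernel header:

    struct net;	/* opaque namespace handle */

    #ifdef CONFIG_NET_NS
    static inline int net_eq(const struct net *net1, const struct net *net2)
    {
    	return net1 == net2;
    }
    #else
    static inline int net_eq(const struct net *net1, const struct net *net2)
    {
    	return 1;	/* only &init_net exists, every comparison succeeds */
    }
    #endif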
@@ -158,7 +158,7 @@ have_snum:
158 hashinfo->bhash_size)]; 158 hashinfo->bhash_size)];
159 spin_lock(&head->lock); 159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain) 160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (ib_net(tb) == net && tb->port == snum) 161 if (net_eq(ib_net(tb), net) && tb->port == snum)
162 goto tb_found; 162 goto tb_found;
163 } 163 }
164 tb = NULL; 164 tb = NULL;
@@ -358,6 +358,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
358 const struct inet_request_sock *ireq = inet_rsk(req); 358 const struct inet_request_sock *ireq = inet_rsk(req);
359 struct ip_options *opt = inet_rsk(req)->opt; 359 struct ip_options *opt = inet_rsk(req)->opt;
360 struct flowi fl = { .oif = sk->sk_bound_dev_if, 360 struct flowi fl = { .oif = sk->sk_bound_dev_if,
361 .mark = sk->sk_mark,
361 .nl_u = { .ip4_u = 362 .nl_u = { .ip4_u =
362 { .daddr = ((opt && opt->srr) ? 363 { .daddr = ((opt && opt->srr) ?
363 opt->faddr : 364 opt->faddr :
@@ -367,7 +368,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
367 .proto = sk->sk_protocol, 368 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk), 369 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports = 370 .uli_u = { .ports =
370 { .sport = inet_sk(sk)->sport, 371 { .sport = inet_sk(sk)->inet_sport,
371 .dport = ireq->rmt_port } } }; 372 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 373 struct net *net = sock_net(sk);
373 374
@@ -528,9 +529,11 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
528 syn_ack_recalc(req, thresh, max_retries, 529 syn_ack_recalc(req, thresh, max_retries,
529 queue->rskq_defer_accept, 530 queue->rskq_defer_accept,
530 &expire, &resend); 531 &expire, &resend);
532 if (req->rsk_ops->syn_ack_timeout)
533 req->rsk_ops->syn_ack_timeout(parent, req);
531 if (!expire && 534 if (!expire &&
532 (!resend || 535 (!resend ||
533 !req->rsk_ops->rtx_syn_ack(parent, req) || 536 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
534 inet_rsk(req)->acked)) { 537 inet_rsk(req)->acked)) {
535 unsigned long timeo; 538 unsigned long timeo;
536 539
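The new syn_ack_timeout hook is deliberately optional: a protocol that needs no SYN-ACK timeout accounting simply leaves the ops slot NULL. A minimal sketch of the guarded-callback pattern, with an illustrative two-argument signature and made-up names:

    #include <stddef.h>

    struct req_ops {
    	int  (*rtx_syn_ack)(void *parent, void *req);
    	void (*syn_ack_timeout)(void *parent, void *req);  /* optional */
    };

    static void prune_request(const struct req_ops *ops, void *parent, void *req)
    {
    	/* optional hook: NULL simply means "no timeout bookkeeping" */
    	if (ops->syn_ack_timeout)
    		ops->syn_ack_timeout(parent, req);

    	if (ops->rtx_syn_ack)
    		ops->rtx_syn_ack(parent, req);	/* retransmit the SYN-ACK */
    }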
@@ -574,9 +577,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
574 newsk->sk_state = TCP_SYN_RECV; 577 newsk->sk_state = TCP_SYN_RECV;
575 newicsk->icsk_bind_hash = NULL; 578 newicsk->icsk_bind_hash = NULL;
576 579
577 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; 580 inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
578 inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); 581 inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
579 inet_sk(newsk)->sport = inet_rsk(req)->loc_port; 582 inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
580 newsk->sk_write_space = sk_stream_write_space; 583 newsk->sk_write_space = sk_stream_write_space;
581 584
582 newicsk->icsk_retransmits = 0; 585 newicsk->icsk_retransmits = 0;
@@ -607,8 +610,8 @@ void inet_csk_destroy_sock(struct sock *sk)
607 /* It cannot be in hash table! */ 610 /* It cannot be in hash table! */
608 WARN_ON(!sk_unhashed(sk)); 611 WARN_ON(!sk_unhashed(sk));
609 612
610 /* If it has not 0 inet_sk(sk)->num, it must be bound */ 613 /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
611 WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash); 614 WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
612 615
613 sk->sk_prot->destroy(sk); 616 sk->sk_prot->destroy(sk);
614 617
@@ -643,8 +646,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
643 * after validation is complete. 646 * after validation is complete.
644 */ 647 */
645 sk->sk_state = TCP_LISTEN; 648 sk->sk_state = TCP_LISTEN;
646 if (!sk->sk_prot->get_port(sk, inet->num)) { 649 if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
647 inet->sport = htons(inet->num); 650 inet->inet_sport = htons(inet->inet_num);
648 651
649 sk_dst_reset(sk); 652 sk_dst_reset(sk);
650 sk->sk_prot->hash(sk); 653 sk->sk_prot->hash(sk);
@@ -720,8 +723,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
720 const struct inet_sock *inet = inet_sk(sk); 723 const struct inet_sock *inet = inet_sk(sk);
721 724
722 sin->sin_family = AF_INET; 725 sin->sin_family = AF_INET;
723 sin->sin_addr.s_addr = inet->daddr; 726 sin->sin_addr.s_addr = inet->inet_daddr;
724 sin->sin_port = inet->dport; 727 sin->sin_port = inet->inet_dport;
725} 728}
726 729
727EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); 730EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a706a47f4dbb..e5fa2ddce320 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/random.h> 16#include <linux/random.h>
17#include <linux/slab.h>
17#include <linux/cache.h> 18#include <linux/cache.h>
18#include <linux/init.h> 19#include <linux/init.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -116,10 +117,10 @@ static int inet_csk_diag_fill(struct sock *sk,
116 r->id.idiag_cookie[0] = (u32)(unsigned long)sk; 117 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
117 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); 118 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
118 119
119 r->id.idiag_sport = inet->sport; 120 r->id.idiag_sport = inet->inet_sport;
120 r->id.idiag_dport = inet->dport; 121 r->id.idiag_dport = inet->inet_dport;
121 r->id.idiag_src[0] = inet->rcv_saddr; 122 r->id.idiag_src[0] = inet->inet_rcv_saddr;
122 r->id.idiag_dst[0] = inet->daddr; 123 r->id.idiag_dst[0] = inet->inet_daddr;
123 124
124#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
125 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
@@ -368,7 +369,7 @@ static int inet_diag_bc_run(const void *bc, int len,
368 yes = entry->sport >= op[1].no; 369 yes = entry->sport >= op[1].no;
369 break; 370 break;
370 case INET_DIAG_BC_S_LE: 371 case INET_DIAG_BC_S_LE:
371 yes = entry->dport <= op[1].no; 372 yes = entry->sport <= op[1].no;
372 break; 373 break;
373 case INET_DIAG_BC_D_GE: 374 case INET_DIAG_BC_D_GE:
374 yes = entry->dport >= op[1].no; 375 yes = entry->dport >= op[1].no;
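The INET_DIAG_BC_S_LE change is a genuine bug fix: the source-port less-or-equal opcode was evaluated against entry->dport, so source-port filters silently matched the destination port instead. The corrected interpreter case, as a standalone sketch (enum and struct names illustrative):

    #include <stdbool.h>

    enum bc_op { BC_S_GE, BC_S_LE, BC_D_GE, BC_D_LE };

    struct diag_entry {
    	unsigned short sport, dport;
    };

    static bool bc_run_one(enum bc_op op, unsigned short arg,
    		       const struct diag_entry *e)
    {
    	switch (op) {
    	case BC_S_GE: return e->sport >= arg;
    	case BC_S_LE: return e->sport <= arg;	/* was e->dport before the fix */
    	case BC_D_GE: return e->dport >= arg;
    	case BC_D_LE: return e->dport <= arg;
    	}
    	return false;
    }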
@@ -504,11 +505,11 @@ static int inet_csk_diag_dump(struct sock *sk,
504 } else 505 } else
505#endif 506#endif
506 { 507 {
507 entry.saddr = &inet->rcv_saddr; 508 entry.saddr = &inet->inet_rcv_saddr;
508 entry.daddr = &inet->daddr; 509 entry.daddr = &inet->inet_daddr;
509 } 510 }
510 entry.sport = inet->num; 511 entry.sport = inet->inet_num;
511 entry.dport = ntohs(inet->dport); 512 entry.dport = ntohs(inet->inet_dport);
512 entry.userlocks = sk->sk_userlocks; 513 entry.userlocks = sk->sk_userlocks;
513 514
514 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 515 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
@@ -584,7 +585,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
584 if (tmo < 0) 585 if (tmo < 0)
585 tmo = 0; 586 tmo = 0;
586 587
587 r->id.idiag_sport = inet->sport; 588 r->id.idiag_sport = inet->inet_sport;
588 r->id.idiag_dport = ireq->rmt_port; 589 r->id.idiag_dport = ireq->rmt_port;
589 r->id.idiag_src[0] = ireq->loc_addr; 590 r->id.idiag_src[0] = ireq->loc_addr;
590 r->id.idiag_dst[0] = ireq->rmt_addr; 591 r->id.idiag_dst[0] = ireq->rmt_addr;
@@ -639,7 +640,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
639 640
640 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 641 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
641 bc = (struct rtattr *)(r + 1); 642 bc = (struct rtattr *)(r + 1);
642 entry.sport = inet->num; 643 entry.sport = inet->inet_num;
643 entry.userlocks = sk->sk_userlocks; 644 entry.userlocks = sk->sk_userlocks;
644 } 645 }
645 646
@@ -732,7 +733,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
732 continue; 733 continue;
733 } 734 }
734 735
735 if (r->id.idiag_sport != inet->sport && 736 if (r->id.idiag_sport != inet->inet_sport &&
736 r->id.idiag_sport) 737 r->id.idiag_sport)
737 goto next_listen; 738 goto next_listen;
738 739
@@ -774,7 +775,7 @@ skip_listen_ht:
774 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) 775 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
775 goto unlock; 776 goto unlock;
776 777
777 for (i = s_i; i < hashinfo->ehash_size; i++) { 778 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
778 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 779 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
779 spinlock_t *lock = inet_ehash_lockp(hashinfo, i); 780 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
780 struct sock *sk; 781 struct sock *sk;
@@ -797,10 +798,10 @@ skip_listen_ht:
797 goto next_normal; 798 goto next_normal;
798 if (!(r->idiag_states & (1 << sk->sk_state))) 799 if (!(r->idiag_states & (1 << sk->sk_state)))
799 goto next_normal; 800 goto next_normal;
800 if (r->id.idiag_sport != inet->sport && 801 if (r->id.idiag_sport != inet->inet_sport &&
801 r->id.idiag_sport) 802 r->id.idiag_sport)
802 goto next_normal; 803 goto next_normal;
803 if (r->id.idiag_dport != inet->dport && 804 if (r->id.idiag_dport != inet->inet_dport &&
804 r->id.idiag_dport) 805 r->id.idiag_dport)
805 goto next_normal; 806 goto next_normal;
806 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 807 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c8646a..a2ca6aed763b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <linux/rtnetlink.h> 21#include <linux/rtnetlink.h>
22#include <linux/slab.h>
22 23
23#include <net/inet_frag.h> 24#include <net/inet_frag.h>
24 25
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 625cc5f64c94..2b79377b468d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
64 64
65 atomic_inc(&hashinfo->bsockets); 65 atomic_inc(&hashinfo->bsockets);
66 66
67 inet_sk(sk)->num = snum; 67 inet_sk(sk)->inet_num = snum;
68 sk_add_bind_node(sk, &tb->owners); 68 sk_add_bind_node(sk, &tb->owners);
69 tb->num_owners++; 69 tb->num_owners++;
70 inet_csk(sk)->icsk_bind_hash = tb; 70 inet_csk(sk)->icsk_bind_hash = tb;
@@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
76static void __inet_put_port(struct sock *sk) 76static void __inet_put_port(struct sock *sk)
77{ 77{
78 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 78 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
79 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, 79 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
80 hashinfo->bhash_size); 80 hashinfo->bhash_size);
81 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 81 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
82 struct inet_bind_bucket *tb; 82 struct inet_bind_bucket *tb;
@@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk)
88 __sk_del_bind_node(sk); 88 __sk_del_bind_node(sk);
89 tb->num_owners--; 89 tb->num_owners--;
90 inet_csk(sk)->icsk_bind_hash = NULL; 90 inet_csk(sk)->icsk_bind_hash = NULL;
91 inet_sk(sk)->num = 0; 91 inet_sk(sk)->inet_num = 0;
92 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 92 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
93 spin_unlock(&head->lock); 93 spin_unlock(&head->lock);
94} 94}
@@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port);
105void __inet_inherit_port(struct sock *sk, struct sock *child) 105void __inet_inherit_port(struct sock *sk, struct sock *child)
106{ 106{
107 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 107 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
108 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, 108 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
109 table->bhash_size); 109 table->bhash_size);
110 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
111 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
@@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
126 int score = -1; 126 int score = -1;
127 struct inet_sock *inet = inet_sk(sk); 127 struct inet_sock *inet = inet_sk(sk);
128 128
129 if (net_eq(sock_net(sk), net) && inet->num == hnum && 129 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
130 !ipv6_only_sock(sk)) { 130 !ipv6_only_sock(sk)) {
131 __be32 rcv_saddr = inet->rcv_saddr; 131 __be32 rcv_saddr = inet->inet_rcv_saddr;
132 score = sk->sk_family == PF_INET ? 1 : 0; 132 score = sk->sk_family == PF_INET ? 1 : 0;
133 if (rcv_saddr) { 133 if (rcv_saddr) {
134 if (rcv_saddr != daddr) 134 if (rcv_saddr != daddr)
@@ -209,7 +209,7 @@ struct sock * __inet_lookup_established(struct net *net,
209 * have wildcards anyways. 209 * have wildcards anyways.
210 */ 210 */
211 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 211 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
212 unsigned int slot = hash & (hashinfo->ehash_size - 1); 212 unsigned int slot = hash & hashinfo->ehash_mask;
213 struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 213 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
214 214
215 rcu_read_lock(); 215 rcu_read_lock();
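Replacing ehash_size with ehash_mask works because the established hash is sized to a power of two: storing size - 1 lets slot selection use a plain AND and full-table walks run i <= mask, with no modulo and no repeated subtraction. A sketch with illustrative types:

    #include <stdint.h>

    struct bucket;

    struct htab {
    	unsigned int ehash_mask;  /* table size - 1, size a power of two */
    	struct bucket **ehash;
    };

    static inline unsigned int ehash_slot(const struct htab *t, uint32_t hash)
    {
    	return hash & t->ehash_mask;	/* cheaper than hash % size */
    }

    static void walk_all(const struct htab *t, void (*fn)(struct bucket *))
    {
    	unsigned int i;

    	/* note <=, since the mask is itself the last valid index */
    	for (i = 0; i <= t->ehash_mask; i++)
    		fn(t->ehash[i]);
    }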
@@ -273,18 +273,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
273{ 273{
274 struct inet_hashinfo *hinfo = death_row->hashinfo; 274 struct inet_hashinfo *hinfo = death_row->hashinfo;
275 struct inet_sock *inet = inet_sk(sk); 275 struct inet_sock *inet = inet_sk(sk);
276 __be32 daddr = inet->rcv_saddr; 276 __be32 daddr = inet->inet_rcv_saddr;
277 __be32 saddr = inet->daddr; 277 __be32 saddr = inet->inet_daddr;
278 int dif = sk->sk_bound_dev_if; 278 int dif = sk->sk_bound_dev_if;
279 INET_ADDR_COOKIE(acookie, saddr, daddr) 279 INET_ADDR_COOKIE(acookie, saddr, daddr)
280 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 280 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
281 struct net *net = sock_net(sk); 281 struct net *net = sock_net(sk);
282 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); 282 unsigned int hash = inet_ehashfn(net, daddr, lport,
283 saddr, inet->inet_dport);
283 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 284 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
284 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 285 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
285 struct sock *sk2; 286 struct sock *sk2;
286 const struct hlist_nulls_node *node; 287 const struct hlist_nulls_node *node;
287 struct inet_timewait_sock *tw; 288 struct inet_timewait_sock *tw;
289 int twrefcnt = 0;
288 290
289 spin_lock(lock); 291 spin_lock(lock);
290 292
@@ -312,25 +314,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
312unique: 314unique:
313 /* Must record num and sport now. Otherwise we will see 315 /* Must record num and sport now. Otherwise we will see
314 * a socket with a funny identity in the hash table. */ 316 * a socket with a funny identity in the hash table. */
315 inet->num = lport; 317 inet->inet_num = lport;
316 inet->sport = htons(lport); 318 inet->inet_sport = htons(lport);
317 sk->sk_hash = hash; 319 sk->sk_hash = hash;
318 WARN_ON(!sk_unhashed(sk)); 320 WARN_ON(!sk_unhashed(sk));
319 __sk_nulls_add_node_rcu(sk, &head->chain); 321 __sk_nulls_add_node_rcu(sk, &head->chain);
322 if (tw) {
323 twrefcnt = inet_twsk_unhash(tw);
324 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
325 }
320 spin_unlock(lock); 326 spin_unlock(lock);
327 if (twrefcnt)
328 inet_twsk_put(tw);
321 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 329 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
322 330
323 if (twp) { 331 if (twp) {
324 *twp = tw; 332 *twp = tw;
325 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
326 } else if (tw) { 333 } else if (tw) {
327 /* Silly. Should hash-dance instead... */ 334 /* Silly. Should hash-dance instead... */
328 inet_twsk_deschedule(tw, death_row); 335 inet_twsk_deschedule(tw, death_row);
329 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
330 336
331 inet_twsk_put(tw); 337 inet_twsk_put(tw);
332 } 338 }
333
334 return 0; 339 return 0;
335 340
336not_unique: 341not_unique:
@@ -341,16 +346,18 @@ not_unique:
341static inline u32 inet_sk_port_offset(const struct sock *sk) 346static inline u32 inet_sk_port_offset(const struct sock *sk)
342{ 347{
343 const struct inet_sock *inet = inet_sk(sk); 348 const struct inet_sock *inet = inet_sk(sk);
344 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 349 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
345 inet->dport); 350 inet->inet_daddr,
351 inet->inet_dport);
346} 352}
347 353
348void __inet_hash_nolisten(struct sock *sk) 354int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
349{ 355{
350 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 356 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
351 struct hlist_nulls_head *list; 357 struct hlist_nulls_head *list;
352 spinlock_t *lock; 358 spinlock_t *lock;
353 struct inet_ehash_bucket *head; 359 struct inet_ehash_bucket *head;
360 int twrefcnt = 0;
354 361
355 WARN_ON(!sk_unhashed(sk)); 362 WARN_ON(!sk_unhashed(sk));
356 363
@@ -361,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk)
361 368
362 spin_lock(lock); 369 spin_lock(lock);
363 __sk_nulls_add_node_rcu(sk, list); 370 __sk_nulls_add_node_rcu(sk, list);
371 if (tw) {
372 WARN_ON(sk->sk_hash != tw->tw_hash);
373 twrefcnt = inet_twsk_unhash(tw);
374 }
364 spin_unlock(lock); 375 spin_unlock(lock);
365 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 376 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
377 return twrefcnt;
366} 378}
367EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 379EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
368 380
@@ -372,7 +384,7 @@ static void __inet_hash(struct sock *sk)
372 struct inet_listen_hashbucket *ilb; 384 struct inet_listen_hashbucket *ilb;
373 385
374 if (sk->sk_state != TCP_LISTEN) { 386 if (sk->sk_state != TCP_LISTEN) {
375 __inet_hash_nolisten(sk); 387 __inet_hash_nolisten(sk, NULL);
376 return; 388 return;
377 } 389 }
378 390
@@ -421,14 +433,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
421 struct sock *sk, u32 port_offset, 433 struct sock *sk, u32 port_offset,
422 int (*check_established)(struct inet_timewait_death_row *, 434 int (*check_established)(struct inet_timewait_death_row *,
423 struct sock *, __u16, struct inet_timewait_sock **), 435 struct sock *, __u16, struct inet_timewait_sock **),
424 void (*hash)(struct sock *sk)) 436 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
425{ 437{
426 struct inet_hashinfo *hinfo = death_row->hashinfo; 438 struct inet_hashinfo *hinfo = death_row->hashinfo;
427 const unsigned short snum = inet_sk(sk)->num; 439 const unsigned short snum = inet_sk(sk)->inet_num;
428 struct inet_bind_hashbucket *head; 440 struct inet_bind_hashbucket *head;
429 struct inet_bind_bucket *tb; 441 struct inet_bind_bucket *tb;
430 int ret; 442 int ret;
431 struct net *net = sock_net(sk); 443 struct net *net = sock_net(sk);
444 int twrefcnt = 1;
432 445
433 if (!snum) { 446 if (!snum) {
434 int i, remaining, low, high, port; 447 int i, remaining, low, high, port;
@@ -452,7 +465,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
452 * unique enough. 465 * unique enough.
453 */ 466 */
454 inet_bind_bucket_for_each(tb, node, &head->chain) { 467 inet_bind_bucket_for_each(tb, node, &head->chain) {
455 if (ib_net(tb) == net && tb->port == port) { 468 if (net_eq(ib_net(tb), net) &&
469 tb->port == port) {
456 if (tb->fastreuse >= 0) 470 if (tb->fastreuse >= 0)
457 goto next_port; 471 goto next_port;
458 WARN_ON(hlist_empty(&tb->owners)); 472 WARN_ON(hlist_empty(&tb->owners));
@@ -485,14 +499,19 @@ ok:
485 /* Head lock still held and bh's disabled */ 499 /* Head lock still held and bh's disabled */
486 inet_bind_hash(sk, tb, port); 500 inet_bind_hash(sk, tb, port);
487 if (sk_unhashed(sk)) { 501 if (sk_unhashed(sk)) {
488 inet_sk(sk)->sport = htons(port); 502 inet_sk(sk)->inet_sport = htons(port);
489 hash(sk); 503 twrefcnt += hash(sk, tw);
490 } 504 }
505 if (tw)
506 twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
491 spin_unlock(&head->lock); 507 spin_unlock(&head->lock);
492 508
493 if (tw) { 509 if (tw) {
494 inet_twsk_deschedule(tw, death_row); 510 inet_twsk_deschedule(tw, death_row);
495 inet_twsk_put(tw); 511 while (twrefcnt) {
512 twrefcnt--;
513 inet_twsk_put(tw);
514 }
496 } 515 }
497 516
498 ret = 0; 517 ret = 0;
@@ -503,7 +522,7 @@ ok:
503 tb = inet_csk(sk)->icsk_bind_hash; 522 tb = inet_csk(sk)->icsk_bind_hash;
504 spin_lock_bh(&head->lock); 523 spin_lock_bh(&head->lock);
505 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 524 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
506 hash(sk); 525 hash(sk, NULL);
507 spin_unlock_bh(&head->lock); 526 spin_unlock_bh(&head->lock);
508 return 0; 527 return 0;
509 } else { 528 } else {
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 6a667dae315e..47038cb6c138 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
64 if (iph->ihl != IPH_LEN_WO_OPTIONS) 64 if (iph->ihl != IPH_LEN_WO_OPTIONS)
65 return -1; 65 return -1;
66 66
67 if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack 67 if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
68 || tcph->rst || tcph->syn || tcph->fin) 68 tcph->rst || tcph->syn || tcph->fin)
69 return -1; 69 return -1;
70 70
71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) 71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
72 return -1; 72 return -1;
73 73
74 if (tcph->doff != TCPH_LEN_WO_OPTIONS 74 if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
75 && tcph->doff != TCPH_LEN_W_TIMESTAMP) 75 tcph->doff != TCPH_LEN_W_TIMESTAMP)
76 return -1; 76 return -1;
77 77
78 /* check tcp options (only timestamp allowed) */ 78 /* check tcp options (only timestamp allowed) */
@@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
262 struct iphdr *iph, 262 struct iphdr *iph,
263 struct tcphdr *tcph) 263 struct tcphdr *tcph)
264{ 264{
265 if ((lro_desc->iph->saddr != iph->saddr) 265 if ((lro_desc->iph->saddr != iph->saddr) ||
266 || (lro_desc->iph->daddr != iph->daddr) 266 (lro_desc->iph->daddr != iph->daddr) ||
267 || (lro_desc->tcph->source != tcph->source) 267 (lro_desc->tcph->source != tcph->source) ||
268 || (lro_desc->tcph->dest != tcph->dest)) 268 (lro_desc->tcph->dest != tcph->dest))
269 return -1; 269 return -1;
270 return 0; 270 return 0;
271} 271}
@@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
339 u64 flags; 339 u64 flags;
340 int vlan_hdr_len = 0; 340 int vlan_hdr_len = 0;
341 341
342 if (!lro_mgr->get_skb_header 342 if (!lro_mgr->get_skb_header ||
343 || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, 343 lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
344 &flags, priv)) 344 &flags, priv))
345 goto out; 345 goto out;
346 346
347 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) 347 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
351 if (!lro_desc) 351 if (!lro_desc)
352 goto out; 352 goto out;
353 353
354 if ((skb->protocol == htons(ETH_P_8021Q)) 354 if ((skb->protocol == htons(ETH_P_8021Q)) &&
355 && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) 355 !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
356 vlan_hdr_len = VLAN_HLEN; 356 vlan_hdr_len = VLAN_HLEN;
357 357
358 if (!lro_desc->active) { /* start new lro session */ 358 if (!lro_desc->active) { /* start new lro session */
@@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
446 int hdr_len = LRO_MAX_PG_HLEN; 446 int hdr_len = LRO_MAX_PG_HLEN;
447 int vlan_hdr_len = 0; 447 int vlan_hdr_len = 0;
448 448
449 if (!lro_mgr->get_frag_header 449 if (!lro_mgr->get_frag_header ||
450 || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, 450 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
451 (void *)&tcph, &flags, priv)) { 451 (void *)&tcph, &flags, priv)) {
452 mac_hdr = page_address(frags->page) + frags->page_offset; 452 mac_hdr = page_address(frags->page) + frags->page_offset;
453 goto out1; 453 goto out1;
454 } 454 }
@@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
472 if (!skb) 472 if (!skb)
473 goto out; 473 goto out;
474 474
475 if ((skb->protocol == htons(ETH_P_8021Q)) 475 if ((skb->protocol == htons(ETH_P_8021Q)) &&
476 && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) 476 !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
477 vlan_hdr_len = VLAN_HLEN; 477 vlan_hdr_len = VLAN_HLEN;
478 478
479 iph = (void *)(skb->data + vlan_hdr_len); 479 iph = (void *)(skb->data + vlan_hdr_len);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781f35cd..c5af909cf701 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -10,44 +10,92 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h> 12#include <linux/kmemcheck.h>
13#include <linux/slab.h>
13#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
14#include <net/inet_timewait_sock.h> 15#include <net/inet_timewait_sock.h>
15#include <net/ip.h> 16#include <net/ip.h>
16 17
18
19/**
20 * inet_twsk_unhash - unhash a timewait socket from established hash
21 * @tw: timewait socket
22 *
 23 * unhash a timewait socket from the established hash, if hashed.
24 * ehash lock must be held by caller.
25 * Returns 1 if caller should call inet_twsk_put() after lock release.
26 */
27int inet_twsk_unhash(struct inet_timewait_sock *tw)
28{
29 if (hlist_nulls_unhashed(&tw->tw_node))
30 return 0;
31
32 hlist_nulls_del_rcu(&tw->tw_node);
33 sk_nulls_node_init(&tw->tw_node);
34 /*
 35 * We cannot call inet_twsk_put() ourselves under the lock;
 36 * the caller must call it for us.
37 */
38 return 1;
39}
40
41/**
42 * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
43 * @tw: timewait socket
44 * @hashinfo: hashinfo pointer
45 *
 46 * unhash a timewait socket from the bind hash, if hashed.
47 * bind hash lock must be held by caller.
48 * Returns 1 if caller should call inet_twsk_put() after lock release.
49 */
50int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
51 struct inet_hashinfo *hashinfo)
52{
53 struct inet_bind_bucket *tb = tw->tw_tb;
54
55 if (!tb)
56 return 0;
57
58 __hlist_del(&tw->tw_bind_node);
59 tw->tw_tb = NULL;
60 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
61 /*
 62 * We cannot call inet_twsk_put() ourselves under the lock;
 63 * the caller must call it for us.
64 */
65 return 1;
66}
67
17/* Must be called with locally disabled BHs. */ 68/* Must be called with locally disabled BHs. */
18static void __inet_twsk_kill(struct inet_timewait_sock *tw, 69static void __inet_twsk_kill(struct inet_timewait_sock *tw,
19 struct inet_hashinfo *hashinfo) 70 struct inet_hashinfo *hashinfo)
20{ 71{
21 struct inet_bind_hashbucket *bhead; 72 struct inet_bind_hashbucket *bhead;
22 struct inet_bind_bucket *tb; 73 int refcnt;
23 /* Unlink from established hashes. */ 74 /* Unlink from established hashes. */
24 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); 75 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
25 76
26 spin_lock(lock); 77 spin_lock(lock);
27 if (hlist_nulls_unhashed(&tw->tw_node)) { 78 refcnt = inet_twsk_unhash(tw);
28 spin_unlock(lock);
29 return;
30 }
31 hlist_nulls_del_rcu(&tw->tw_node);
32 sk_nulls_node_init(&tw->tw_node);
33 spin_unlock(lock); 79 spin_unlock(lock);
34 80
35 /* Disassociate with bind bucket. */ 81 /* Disassociate with bind bucket. */
36 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, 82 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
37 hashinfo->bhash_size)]; 83 hashinfo->bhash_size)];
84
38 spin_lock(&bhead->lock); 85 spin_lock(&bhead->lock);
39 tb = tw->tw_tb; 86 refcnt += inet_twsk_bind_unhash(tw, hashinfo);
40 __hlist_del(&tw->tw_bind_node);
41 tw->tw_tb = NULL;
42 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
43 spin_unlock(&bhead->lock); 87 spin_unlock(&bhead->lock);
88
44#ifdef SOCK_REFCNT_DEBUG 89#ifdef SOCK_REFCNT_DEBUG
45 if (atomic_read(&tw->tw_refcnt) != 1) { 90 if (atomic_read(&tw->tw_refcnt) != 1) {
46 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", 91 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
47 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 92 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
48 } 93 }
49#endif 94#endif
50 inet_twsk_put(tw); 95 while (refcnt) {
96 inet_twsk_put(tw);
97 refcnt--;
98 }
51} 99}
52 100
53static noinline void inet_twsk_free(struct inet_timewait_sock *tw) 101static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
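Note the shape of the rewritten __inet_twsk_kill(): each unhash helper reports, while its lock is held, how many references the caller now owes, and the inet_twsk_put() calls happen only after both locks are dropped, because the final put may free the socket. A self-contained model of that discipline (all names illustrative):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct obj {
    	atomic_int refcnt;
    	int on_ehash, on_bhash;
    };

    static pthread_mutex_t ehash_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t bhash_lock = PTHREAD_MUTEX_INITIALIZER;

    static void obj_put(struct obj *o)
    {
    	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
    		free(o);	/* last reference: the object disappears */
    }

    /* each unhash helper returns how many puts the caller now owes */
    static int unhash(int *linked)
    {
    	int was = *linked;

    	*linked = 0;
    	return was;
    }

    static void obj_kill(struct obj *o)
    {
    	int refcnt = 0;

    	pthread_mutex_lock(&ehash_lock);
    	refcnt += unhash(&o->on_ehash);
    	pthread_mutex_unlock(&ehash_lock);

    	pthread_mutex_lock(&bhash_lock);
    	refcnt += unhash(&o->on_bhash);
    	pthread_mutex_unlock(&bhash_lock);

    	/* only now, with no lock held, is it safe to drop references:
    	 * the final obj_put() may free(o) */
    	while (refcnt--)
    		obj_put(o);
    }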
@@ -86,7 +134,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
86 Note, that any socket with inet->num != 0 MUST be bound in 134 Note, that any socket with inet->num != 0 MUST be bound in
87 binding cache, even if it is closed. 135 binding cache, even if it is closed.
88 */ 136 */
89 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, 137 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
90 hashinfo->bhash_size)]; 138 hashinfo->bhash_size)];
91 spin_lock(&bhead->lock); 139 spin_lock(&bhead->lock);
92 tw->tw_tb = icsk->icsk_bind_hash; 140 tw->tw_tb = icsk->icsk_bind_hash;
@@ -101,16 +149,24 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
101 * Should be done before removing sk from established chain 149 * Should be done before removing sk from established chain
102 * because readers are lockless and search established first. 150 * because readers are lockless and search established first.
103 */ 151 */
104 atomic_inc(&tw->tw_refcnt);
105 inet_twsk_add_node_rcu(tw, &ehead->twchain); 152 inet_twsk_add_node_rcu(tw, &ehead->twchain);
106 153
107 /* Step 3: Remove SK from established hash. */ 154 /* Step 3: Remove SK from established hash. */
108 if (__sk_nulls_del_node_init_rcu(sk)) 155 if (__sk_nulls_del_node_init_rcu(sk))
109 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 156 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
110 157
158 /*
159 * Notes :
160 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
161 * - We add one reference for the bhash link
162 * - We add one reference for the ehash link
163 * - We want this refcnt update done before allowing other
164 * threads to find this tw in ehash chain.
165 */
166 atomic_add(1 + 1 + 1, &tw->tw_refcnt);
167
111 spin_unlock(lock); 168 spin_unlock(lock);
112} 169}
113
114EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 170EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
115 171
116struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 172struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -124,14 +180,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
124 kmemcheck_annotate_bitfield(tw, flags); 180 kmemcheck_annotate_bitfield(tw, flags);
125 181
126 /* Give us an identity. */ 182 /* Give us an identity. */
127 tw->tw_daddr = inet->daddr; 183 tw->tw_daddr = inet->inet_daddr;
128 tw->tw_rcv_saddr = inet->rcv_saddr; 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
129 tw->tw_bound_dev_if = sk->sk_bound_dev_if; 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
130 tw->tw_num = inet->num; 186 tw->tw_num = inet->inet_num;
131 tw->tw_state = TCP_TIME_WAIT; 187 tw->tw_state = TCP_TIME_WAIT;
132 tw->tw_substate = state; 188 tw->tw_substate = state;
133 tw->tw_sport = inet->sport; 189 tw->tw_sport = inet->inet_sport;
134 tw->tw_dport = inet->dport; 190 tw->tw_dport = inet->inet_dport;
135 tw->tw_family = sk->sk_family; 191 tw->tw_family = sk->sk_family;
136 tw->tw_reuse = sk->sk_reuse; 192 tw->tw_reuse = sk->sk_reuse;
137 tw->tw_hash = sk->sk_hash; 193 tw->tw_hash = sk->sk_hash;
@@ -139,14 +195,18 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
139 tw->tw_transparent = inet->transparent; 195 tw->tw_transparent = inet->transparent;
140 tw->tw_prot = sk->sk_prot_creator; 196 tw->tw_prot = sk->sk_prot_creator;
141 twsk_net_set(tw, hold_net(sock_net(sk))); 197 twsk_net_set(tw, hold_net(sock_net(sk)));
142 atomic_set(&tw->tw_refcnt, 1); 198 /*
199 * Because we use RCU lookups, we should not set tw_refcnt
 200 * to a non-null value before everything is set up for this
201 * timewait socket.
202 */
203 atomic_set(&tw->tw_refcnt, 0);
143 inet_twsk_dead_node_init(tw); 204 inet_twsk_dead_node_init(tw);
144 __module_get(tw->tw_prot->owner); 205 __module_get(tw->tw_prot->owner);
145 } 206 }
146 207
147 return tw; 208 return tw;
148} 209}
149
150EXPORT_SYMBOL_GPL(inet_twsk_alloc); 210EXPORT_SYMBOL_GPL(inet_twsk_alloc);
151 211
152/* Returns non-zero if quota exceeded. */ 212/* Returns non-zero if quota exceeded. */
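The two refcount hunks above are halves of one publication protocol: inet_twsk_alloc() now leaves tw_refcnt at 0 while the fields are initialised, and __inet_twsk_hashdance() makes the object live with a single atomic_add once both hash links exist; lockless readers must then acquire references with an inc-not-zero operation, so a half-built or dying timewait socket can never be pinned. Both sides modelled with C11 atomics (names illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct twsock {
    	atomic_int refcnt;	/* 0 until fully published */
    	/* ... identity fields, filled in before publication ... */
    };

    /* reader side: the equivalent of atomic_inc_not_zero() */
    static bool tw_tryhold(struct twsock *tw)
    {
    	int c = atomic_load(&tw->refcnt);

    	while (c != 0)
    		if (atomic_compare_exchange_weak(&tw->refcnt, &c, c + 1))
    			return true;	/* got a reference on a live object */
    	return false;			/* not yet published, or being freed */
    }

    /* writer side: every field is initialised first, then the refcount
     * jumps from 0 to its final value in a single atomic step */
    static void tw_publish(struct twsock *tw, int nr_refs)
    {
    	atomic_fetch_add(&tw->refcnt, nr_refs);	/* e.g. 1 + 1 + 1 */
    }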
@@ -225,7 +285,6 @@ void inet_twdr_hangman(unsigned long data)
225out: 285out:
226 spin_unlock(&twdr->death_lock); 286 spin_unlock(&twdr->death_lock);
227} 287}
228
229EXPORT_SYMBOL_GPL(inet_twdr_hangman); 288EXPORT_SYMBOL_GPL(inet_twdr_hangman);
230 289
231void inet_twdr_twkill_work(struct work_struct *work) 290void inet_twdr_twkill_work(struct work_struct *work)
@@ -256,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
256 spin_unlock_bh(&twdr->death_lock); 315 spin_unlock_bh(&twdr->death_lock);
257 } 316 }
258} 317}
259
260EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); 318EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
261 319
262/* These are always called from BH context. See callers in 320/* These are always called from BH context. See callers in
@@ -276,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
276 spin_unlock(&twdr->death_lock); 334 spin_unlock(&twdr->death_lock);
277 __inet_twsk_kill(tw, twdr->hashinfo); 335 __inet_twsk_kill(tw, twdr->hashinfo);
278} 336}
279
280EXPORT_SYMBOL(inet_twsk_deschedule); 337EXPORT_SYMBOL(inet_twsk_deschedule);
281 338
282void inet_twsk_schedule(struct inet_timewait_sock *tw, 339void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -357,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
357 mod_timer(&twdr->tw_timer, jiffies + twdr->period); 414 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
358 spin_unlock(&twdr->death_lock); 415 spin_unlock(&twdr->death_lock);
359} 416}
360
361EXPORT_SYMBOL_GPL(inet_twsk_schedule); 417EXPORT_SYMBOL_GPL(inet_twsk_schedule);
362 418
363void inet_twdr_twcal_tick(unsigned long data) 419void inet_twdr_twcal_tick(unsigned long data)
@@ -418,40 +474,48 @@ out:
418#endif 474#endif
419 spin_unlock(&twdr->death_lock); 475 spin_unlock(&twdr->death_lock);
420} 476}
421
422EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); 477EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
423 478
424void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, 479void inet_twsk_purge(struct inet_hashinfo *hashinfo,
425 struct inet_timewait_death_row *twdr, int family) 480 struct inet_timewait_death_row *twdr, int family)
426{ 481{
427 struct inet_timewait_sock *tw; 482 struct inet_timewait_sock *tw;
428 struct sock *sk; 483 struct sock *sk;
429 struct hlist_nulls_node *node; 484 struct hlist_nulls_node *node;
430 int h; 485 unsigned int slot;
431 486
432 local_bh_disable(); 487 for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
433 for (h = 0; h < (hashinfo->ehash_size); h++) { 488 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
434 struct inet_ehash_bucket *head = 489restart_rcu:
435 inet_ehash_bucket(hashinfo, h); 490 rcu_read_lock();
436 spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
437restart: 491restart:
438 spin_lock(lock); 492 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
439 sk_nulls_for_each(sk, node, &head->twchain) {
440
441 tw = inet_twsk(sk); 493 tw = inet_twsk(sk);
442 if (!net_eq(twsk_net(tw), net) || 494 if ((tw->tw_family != family) ||
443 tw->tw_family != family) 495 atomic_read(&twsk_net(tw)->count))
444 continue; 496 continue;
445 497
446 atomic_inc(&tw->tw_refcnt); 498 if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
447 spin_unlock(lock); 499 continue;
500
501 if (unlikely((tw->tw_family != family) ||
502 atomic_read(&twsk_net(tw)->count))) {
503 inet_twsk_put(tw);
504 goto restart;
505 }
506
507 rcu_read_unlock();
448 inet_twsk_deschedule(tw, twdr); 508 inet_twsk_deschedule(tw, twdr);
449 inet_twsk_put(tw); 509 inet_twsk_put(tw);
450 510 goto restart_rcu;
451 goto restart;
452 } 511 }
453 spin_unlock(lock); 512 /* If the nulls value we got at the end of this lookup is
 513 * not the expected one, we must restart the lookup.
514 * We probably met an item that was moved to another chain.
515 */
516 if (get_nulls_value(node) != slot)
517 goto restart;
518 rcu_read_unlock();
454 } 519 }
455 local_bh_enable();
456} 520}
457EXPORT_SYMBOL_GPL(inet_twsk_purge); 521EXPORT_SYMBOL_GPL(inet_twsk_purge);
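inet_twsk_purge() can walk the ehash chains under rcu_read_lock() because they are "nulls" lists: the terminator is not NULL but an odd-tagged value encoding the slot it belongs to, so a reader that followed an item re-hashed onto another chain ends on the wrong marker and restarts, exactly as the final check above does. The encoding, sketched after the kernel's list_nulls idea:

    #include <stdbool.h>
    #include <stdint.h>

    /* the list terminator encodes its chain: (slot << 1) | 1 */
    #define NULLS_MARKER(slot) ((void *)((((uintptr_t)(slot)) << 1) | 1))

    static inline bool is_a_nulls(const void *ptr)
    {
    	return (uintptr_t)ptr & 1;
    }

    static inline unsigned long get_nulls_value(const void *ptr)
    {
    	return (uintptr_t)ptr >> 1;
    }

    /*
     * Reader skeleton, mirroring the hunk above:
     *
     *	restart:
     *		for (pos = head; !is_a_nulls(pos); pos = pos->next)
     *			...;
     *		if (get_nulls_value(pos) != slot)
     *			goto restart;  // item moved to another chain mid-walk
     */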
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b1fbe18feb5a..6bcfe52a9c87 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -67,9 +67,6 @@
67 * ip_id_count: idlock 67 * ip_id_count: idlock
68 */ 68 */
69 69
70/* Exported for inet_getid inline function. */
71DEFINE_SPINLOCK(inet_peer_idlock);
72
73static struct kmem_cache *peer_cachep __read_mostly; 70static struct kmem_cache *peer_cachep __read_mostly;
74 71
75#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
@@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
390 n->v4daddr = daddr; 387 n->v4daddr = daddr;
391 atomic_set(&n->refcnt, 1); 388 atomic_set(&n->refcnt, 1);
392 atomic_set(&n->rid, 0); 389 atomic_set(&n->rid, 0);
393 n->ip_id_count = secure_ip_id(daddr); 390 atomic_set(&n->ip_id_count, secure_ip_id(daddr));
394 n->tcp_ts_stamp = 0; 391 n->tcp_ts_stamp = 0;
395 392
396 write_lock_bh(&peer_pool_lock); 393 write_lock_bh(&peer_pool_lock);
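Dropping inet_peer_idlock is possible because ip_id_count becomes atomic: allocating IP IDs turns into one fetch-and-add instead of lock/read/increment/unlock. Roughly what the lockless allocator looks like (a sketch, not the verbatim inet_getid()):

    #include <stdatomic.h>

    struct peer {
    	atomic_ushort ip_id_count;	/* per-destination IP ID counter */
    };

    /* reserve "more + 1" consecutive IDs and return the first one;
     * a single fetch-add replaces the old global-spinlock dance */
    static unsigned short get_ip_id(struct peer *p, int more)
    {
    	return atomic_fetch_add(&p->ip_id_count,
    				(unsigned short)(more + 1));
    }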
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc8e32e..af10942b326c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -25,6 +25,7 @@
25#include <linux/ip.h> 25#include <linux/ip.h>
26#include <linux/icmp.h> 26#include <linux/icmp.h>
27#include <linux/netdevice.h> 27#include <linux/netdevice.h>
28#include <linux/slab.h>
28#include <net/sock.h> 29#include <net/sock.h>
29#include <net/ip.h> 30#include <net/ip.h>
30#include <net/tcp.h> 31#include <net/tcp.h>
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d3fe10be7219..75347ea70ea0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -32,6 +32,9 @@
32#include <linux/netdevice.h> 32#include <linux/netdevice.h>
33#include <linux/jhash.h> 33#include <linux/jhash.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/slab.h>
36#include <net/route.h>
37#include <net/dst.h>
35#include <net/sock.h> 38#include <net/sock.h>
36#include <net/ip.h> 39#include <net/ip.h>
37#include <net/icmp.h> 40#include <net/icmp.h>
@@ -205,11 +208,35 @@ static void ip_expire(unsigned long arg)
205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 208 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206 struct sk_buff *head = qp->q.fragments; 209 struct sk_buff *head = qp->q.fragments;
207 210
208 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 211 rcu_read_lock();
209 if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { 212 head->dev = dev_get_by_index_rcu(net, qp->iif);
210 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 213 if (!head->dev)
211 dev_put(head->dev); 214 goto out_rcu_unlock;
215
216 /*
 217 * Only search the routing table for the head fragment
 218 * when defragmentation times out at the PRE_ROUTING hook.
219 */
220 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
221 const struct iphdr *iph = ip_hdr(head);
222 int err = ip_route_input(head, iph->daddr, iph->saddr,
223 iph->tos, head->dev);
224 if (unlikely(err))
225 goto out_rcu_unlock;
226
227 /*
228 * Only an end host needs to send an ICMP
229 * "Fragment Reassembly Timeout" message, per RFC792.
230 */
231 if (skb_rtable(head)->rt_type != RTN_LOCAL)
232 goto out_rcu_unlock;
233
212 } 234 }
235
236 /* Send an ICMP "Fragment Reassembly Timeout" message. */
237 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
238out_rcu_unlock:
239 rcu_read_unlock();
213 } 240 }
214out: 241out:
215 spin_unlock(&qp->q.lock); 242 spin_unlock(&qp->q.lock);
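The added route lookup in ip_expire() enforces RFC 792: only the final destination of a fragmented datagram should emit the "Fragment Reassembly Timeout" ICMP error, so a box that defragments at PRE_ROUTING (for conntrack) stays silent unless the head fragment routes as local. The decision reduced to its core, with an illustrative enum in place of the kernel's rtable types:

    #include <stdbool.h>

    enum route_type { RT_LOCAL, RT_UNICAST, RT_BROADCAST, RT_OTHER };

    /* a middlebox that merely forwards (or would forward) the packet
     * must not report the reassembly timeout; only the end host may */
    static bool may_send_frag_timeout(enum route_type rt)
    {
    	return rt == RT_LOCAL;
    }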
@@ -603,7 +630,6 @@ static int zero;
603 630
604static struct ctl_table ip4_frags_ns_ctl_table[] = { 631static struct ctl_table ip4_frags_ns_ctl_table[] = {
605 { 632 {
606 .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
607 .procname = "ipfrag_high_thresh", 633 .procname = "ipfrag_high_thresh",
608 .data = &init_net.ipv4.frags.high_thresh, 634 .data = &init_net.ipv4.frags.high_thresh,
609 .maxlen = sizeof(int), 635 .maxlen = sizeof(int),
@@ -611,7 +637,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
611 .proc_handler = proc_dointvec 637 .proc_handler = proc_dointvec
612 }, 638 },
613 { 639 {
614 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
615 .procname = "ipfrag_low_thresh", 640 .procname = "ipfrag_low_thresh",
616 .data = &init_net.ipv4.frags.low_thresh, 641 .data = &init_net.ipv4.frags.low_thresh,
617 .maxlen = sizeof(int), 642 .maxlen = sizeof(int),
@@ -619,26 +644,22 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
619 .proc_handler = proc_dointvec 644 .proc_handler = proc_dointvec
620 }, 645 },
621 { 646 {
622 .ctl_name = NET_IPV4_IPFRAG_TIME,
623 .procname = "ipfrag_time", 647 .procname = "ipfrag_time",
624 .data = &init_net.ipv4.frags.timeout, 648 .data = &init_net.ipv4.frags.timeout,
625 .maxlen = sizeof(int), 649 .maxlen = sizeof(int),
626 .mode = 0644, 650 .mode = 0644,
627 .proc_handler = proc_dointvec_jiffies, 651 .proc_handler = proc_dointvec_jiffies,
628 .strategy = sysctl_jiffies
629 }, 652 },
630 { } 653 { }
631}; 654};
632 655
633static struct ctl_table ip4_frags_ctl_table[] = { 656static struct ctl_table ip4_frags_ctl_table[] = {
634 { 657 {
635 .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
636 .procname = "ipfrag_secret_interval", 658 .procname = "ipfrag_secret_interval",
637 .data = &ip4_frags.secret_interval, 659 .data = &ip4_frags.secret_interval,
638 .maxlen = sizeof(int), 660 .maxlen = sizeof(int),
639 .mode = 0644, 661 .mode = 0644,
640 .proc_handler = proc_dointvec_jiffies, 662 .proc_handler = proc_dointvec_jiffies,
641 .strategy = sysctl_jiffies
642 }, 663 },
643 { 664 {
644 .procname = "ipfrag_max_dist", 665 .procname = "ipfrag_max_dist",
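These sysctl hunks drop the .ctl_name and .strategy members as part of retiring the binary sysctl(2) interface; entries are now matched by .procname alone. The resulting entry shape, using values from the patch (table name hypothetical, surrounding entries elided):

    static struct ctl_table frag_time_entry[] = {
        {
            .procname     = "ipfrag_time",
            .data         = &init_net.ipv4.frags.timeout,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = proc_dointvec_jiffies, /* no .strategy needed */
        },
        { } /* terminator */
    };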
@@ -651,13 +672,13 @@ static struct ctl_table ip4_frags_ctl_table[] = {
651 { } 672 { }
652}; 673};
653 674
654static int ip4_frags_ns_ctl_register(struct net *net) 675static int __net_init ip4_frags_ns_ctl_register(struct net *net)
655{ 676{
656 struct ctl_table *table; 677 struct ctl_table *table;
657 struct ctl_table_header *hdr; 678 struct ctl_table_header *hdr;
658 679
659 table = ip4_frags_ns_ctl_table; 680 table = ip4_frags_ns_ctl_table;
660 if (net != &init_net) { 681 if (!net_eq(net, &init_net)) {
661 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); 682 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
662 if (table == NULL) 683 if (table == NULL)
663 goto err_alloc; 684 goto err_alloc;
@@ -675,13 +696,13 @@ static int ip4_frags_ns_ctl_register(struct net *net)
675 return 0; 696 return 0;
676 697
677err_reg: 698err_reg:
678 if (net != &init_net) 699 if (!net_eq(net, &init_net))
679 kfree(table); 700 kfree(table);
680err_alloc: 701err_alloc:
681 return -ENOMEM; 702 return -ENOMEM;
682} 703}
683 704
684static void ip4_frags_ns_ctl_unregister(struct net *net) 705static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
685{ 706{
686 struct ctl_table *table; 707 struct ctl_table *table;
687 708
@@ -709,7 +730,7 @@ static inline void ip4_frags_ctl_register(void)
709} 730}
710#endif 731#endif
711 732
712static int ipv4_frags_init_net(struct net *net) 733static int __net_init ipv4_frags_init_net(struct net *net)
713{ 734{
714 /* 735 /*
715 * Fragment cache limits. We will commit 256K at one time. Should we 736 * Fragment cache limits. We will commit 256K at one time. Should we
@@ -731,7 +752,7 @@ static int ipv4_frags_init_net(struct net *net)
731 return ip4_frags_ns_ctl_register(net); 752 return ip4_frags_ns_ctl_register(net);
732} 753}
733 754
734static void ipv4_frags_exit_net(struct net *net) 755static void __net_exit ipv4_frags_exit_net(struct net *net)
735{ 756{
736 ip4_frags_ns_ctl_unregister(net); 757 ip4_frags_ns_ctl_unregister(net);
737 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); 758 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
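The __net_init/__net_exit annotations let the kernel discard per-namespace setup and teardown code after boot when CONFIG_NET_NS is disabled, because the callbacks then run exactly once. The registration shape these functions plug into, sketched with this file's names (ops identifier hypothetical):

    static struct pernet_operations ip4_frags_ops = {
        .init = ipv4_frags_init_net,   /* __net_init */
        .exit = ipv4_frags_exit_net,   /* __net_exit */
    };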
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 143333852624..fe381d12ecdd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/netdevice.h> 20#include <linux/netdevice.h>
@@ -125,7 +126,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
125 126
126#define HASH_SIZE 16 127#define HASH_SIZE 16
127 128
128static int ipgre_net_id; 129static int ipgre_net_id __read_mostly;
129struct ipgre_net { 130struct ipgre_net {
130 struct ip_tunnel *tunnels[4][HASH_SIZE]; 131 struct ip_tunnel *tunnels[4][HASH_SIZE];
131 132
@@ -156,8 +157,13 @@ struct ipgre_net {
156#define tunnels_r tunnels[2] 157#define tunnels_r tunnels[2]
157#define tunnels_l tunnels[1] 158#define tunnels_l tunnels[1]
158#define tunnels_wc tunnels[0] 159#define tunnels_wc tunnels[0]
160/*
161 * Locking : hash tables are protected by RCU and a spinlock
162 */
163static DEFINE_SPINLOCK(ipgre_lock);
159 164
160static DEFINE_RWLOCK(ipgre_lock); 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
161 167
162/* Given src, dst and key, find appropriate for input tunnel. */ 168/* Given src, dst and key, find appropriate for input tunnel. */
163 169
@@ -175,7 +181,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
175 ARPHRD_ETHER : ARPHRD_IPGRE; 181 ARPHRD_ETHER : ARPHRD_IPGRE;
176 int score, cand_score = 4; 182 int score, cand_score = 4;
177 183
178 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { 184 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
179 if (local != t->parms.iph.saddr || 185 if (local != t->parms.iph.saddr ||
180 remote != t->parms.iph.daddr || 186 remote != t->parms.iph.daddr ||
181 key != t->parms.i_key || 187 key != t->parms.i_key ||
@@ -200,7 +206,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
200 } 206 }
201 } 207 }
202 208
203 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { 209 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
204 if (remote != t->parms.iph.daddr || 210 if (remote != t->parms.iph.daddr ||
205 key != t->parms.i_key || 211 key != t->parms.i_key ||
206 !(t->dev->flags & IFF_UP)) 212 !(t->dev->flags & IFF_UP))
@@ -224,7 +230,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
224 } 230 }
225 } 231 }
226 232
227 for (t = ign->tunnels_l[h1]; t; t = t->next) { 233 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
228 if ((local != t->parms.iph.saddr && 234 if ((local != t->parms.iph.saddr &&
229 (local != t->parms.iph.daddr || 235 (local != t->parms.iph.daddr ||
230 !ipv4_is_multicast(local))) || 236 !ipv4_is_multicast(local))) ||
@@ -250,7 +256,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
250 } 256 }
251 } 257 }
252 258
253 for (t = ign->tunnels_wc[h1]; t; t = t->next) { 259 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
254 if (t->parms.i_key != key || 260 if (t->parms.i_key != key ||
255 !(t->dev->flags & IFF_UP)) 261 !(t->dev->flags & IFF_UP))
256 continue; 262 continue;
@@ -276,8 +282,9 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
276 if (cand != NULL) 282 if (cand != NULL)
277 return cand; 283 return cand;
278 284
279 if (ign->fb_tunnel_dev->flags & IFF_UP) 285 dev = ign->fb_tunnel_dev;
280 return netdev_priv(ign->fb_tunnel_dev); 286 if (dev->flags & IFF_UP)
287 return netdev_priv(dev);
281 288
282 return NULL; 289 return NULL;
283} 290}
@@ -311,10 +318,10 @@ static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
311{ 318{
312 struct ip_tunnel **tp = ipgre_bucket(ign, t); 319 struct ip_tunnel **tp = ipgre_bucket(ign, t);
313 320
321 spin_lock_bh(&ipgre_lock);
314 t->next = *tp; 322 t->next = *tp;
315 write_lock_bh(&ipgre_lock); 323 rcu_assign_pointer(*tp, t);
316 *tp = t; 324 spin_unlock_bh(&ipgre_lock);
317 write_unlock_bh(&ipgre_lock);
318} 325}
319 326
320static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
@@ -323,9 +330,9 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
323 330
324 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
325 if (t == *tp) { 332 if (t == *tp) {
326 write_lock_bh(&ipgre_lock); 333 spin_lock_bh(&ipgre_lock);
327 *tp = t->next; 334 *tp = t->next;
328 write_unlock_bh(&ipgre_lock); 335 spin_unlock_bh(&ipgre_lock);
329 break; 336 break;
330 } 337 }
331 } 338 }
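Writer-side ordering matters here: ipgre_tunnel_link() fully initializes t->next before rcu_assign_pointer() publishes the node, so a concurrent reader can never observe a half-linked entry; the spinlock only serializes writers against each other. Readers pair with it like this (as in ipgre_rcv() below; deliver() is a hypothetical placeholder):

    rcu_read_lock();
    t = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, proto);
    if (t)
        deliver(skb, t);   /* t is pinned only inside the read section */
    rcu_read_unlock();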
@@ -476,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
476 break; 483 break;
477 } 484 }
478 485
479 read_lock(&ipgre_lock); 486 rcu_read_lock();
480 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 487 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
481 flags & GRE_KEY ? 488 flags & GRE_KEY ?
482 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 489 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
@@ -494,7 +501,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
494 t->err_count = 1; 501 t->err_count = 1;
495 t->err_time = jiffies; 502 t->err_time = jiffies;
496out: 503out:
497 read_unlock(&ipgre_lock); 504 rcu_read_unlock();
498 return; 505 return;
499} 506}
500 507
@@ -573,7 +580,7 @@ static int ipgre_rcv(struct sk_buff *skb)
573 580
574 gre_proto = *(__be16 *)(h + 2); 581 gre_proto = *(__be16 *)(h + 2);
575 582
576 read_lock(&ipgre_lock); 583 rcu_read_lock();
577 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 584 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
578 iph->saddr, iph->daddr, key, 585 iph->saddr, iph->daddr, key,
579 gre_proto))) { 586 gre_proto))) {
@@ -647,13 +654,13 @@ static int ipgre_rcv(struct sk_buff *skb)
647 ipgre_ecn_decapsulate(iph, skb); 654 ipgre_ecn_decapsulate(iph, skb);
648 655
649 netif_rx(skb); 656 netif_rx(skb);
650 read_unlock(&ipgre_lock); 657 rcu_read_unlock();
651 return(0); 658 return(0);
652 } 659 }
653 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 660 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
654 661
655drop: 662drop:
656 read_unlock(&ipgre_lock); 663 rcu_read_unlock();
657drop_nolock: 664drop_nolock:
658 kfree_skb(skb); 665 kfree_skb(skb);
659 return(0); 666 return(0);
@@ -662,7 +669,8 @@ drop_nolock:
662static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 669static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
663{ 670{
664 struct ip_tunnel *tunnel = netdev_priv(dev); 671 struct ip_tunnel *tunnel = netdev_priv(dev);
665 struct net_device_stats *stats = &tunnel->dev->stats; 672 struct net_device_stats *stats = &dev->stats;
673 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
666 struct iphdr *old_iph = ip_hdr(skb); 674 struct iphdr *old_iph = ip_hdr(skb);
667 struct iphdr *tiph; 675 struct iphdr *tiph;
668 u8 tos; 676 u8 tos;
@@ -786,7 +794,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
786 } 794 }
787 795
788 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { 796 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
789 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); 797 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
790 ip_rt_put(rt); 798 ip_rt_put(rt);
791 goto tx_error; 799 goto tx_error;
792 } 800 }
@@ -803,14 +811,16 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
803 tunnel->err_count = 0; 811 tunnel->err_count = 0;
804 } 812 }
805 813
806 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; 814 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
807 815
808 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 816 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
809 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 817 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
810 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 818 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
819 if (max_headroom > dev->needed_headroom)
820 dev->needed_headroom = max_headroom;
811 if (!new_skb) { 821 if (!new_skb) {
812 ip_rt_put(rt); 822 ip_rt_put(rt);
813 stats->tx_dropped++; 823 txq->tx_dropped++;
814 dev_kfree_skb(skb); 824 dev_kfree_skb(skb);
815 return NETDEV_TX_OK; 825 return NETDEV_TX_OK;
816 } 826 }
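Recording max_headroom in dev->needed_headroom is a hint to skb allocation helpers that honor it, so after the first undersized packet the expensive skb_realloc_headroom() copy path should stop firing on every transmit:

    if (max_headroom > dev->needed_headroom)
        dev->needed_headroom = max_headroom; /* grow the hint, never shrink */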
@@ -1137,12 +1147,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1137 1147
1138 if (saddr) 1148 if (saddr)
1139 memcpy(&iph->saddr, saddr, 4); 1149 memcpy(&iph->saddr, saddr, 4);
1140 1150 if (daddr)
1141 if (daddr) {
1142 memcpy(&iph->daddr, daddr, 4); 1151 memcpy(&iph->daddr, daddr, 4);
1143 return t->hlen; 1152 if (iph->daddr)
1144 }
1145 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1146 return t->hlen; 1153 return t->hlen;
1147 1154
1148 return -t->hlen; 1155 return -t->hlen;
@@ -1283,33 +1290,27 @@ static const struct net_protocol ipgre_protocol = {
1283 .netns_ok = 1, 1290 .netns_ok = 1,
1284}; 1291};
1285 1292
1286static void ipgre_destroy_tunnels(struct ipgre_net *ign) 1293static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1287{ 1294{
1288 int prio; 1295 int prio;
1289 1296
1290 for (prio = 0; prio < 4; prio++) { 1297 for (prio = 0; prio < 4; prio++) {
1291 int h; 1298 int h;
1292 for (h = 0; h < HASH_SIZE; h++) { 1299 for (h = 0; h < HASH_SIZE; h++) {
1293 struct ip_tunnel *t; 1300 struct ip_tunnel *t = ign->tunnels[prio][h];
1294 while ((t = ign->tunnels[prio][h]) != NULL) 1301
1295 unregister_netdevice(t->dev); 1302 while (t != NULL) {
1303 unregister_netdevice_queue(t->dev, head);
1304 t = t->next;
1305 }
1296 } 1306 }
1297 } 1307 }
1298} 1308}
1299 1309
1300static int ipgre_init_net(struct net *net) 1310static int __net_init ipgre_init_net(struct net *net)
1301{ 1311{
1312 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1302 int err; 1313 int err;
1303 struct ipgre_net *ign;
1304
1305 err = -ENOMEM;
1306 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1307 if (ign == NULL)
1308 goto err_alloc;
1309
1310 err = net_assign_generic(net, ipgre_net_id, ign);
1311 if (err < 0)
1312 goto err_assign;
1313 1314
1314 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", 1315 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1315 ipgre_tunnel_setup); 1316 ipgre_tunnel_setup);
@@ -1330,27 +1331,26 @@ static int ipgre_init_net(struct net *net)
1330err_reg_dev: 1331err_reg_dev:
1331 free_netdev(ign->fb_tunnel_dev); 1332 free_netdev(ign->fb_tunnel_dev);
1332err_alloc_dev: 1333err_alloc_dev:
1333 /* nothing */
1334err_assign:
1335 kfree(ign);
1336err_alloc:
1337 return err; 1334 return err;
1338} 1335}
1339 1336
1340static void ipgre_exit_net(struct net *net) 1337static void __net_exit ipgre_exit_net(struct net *net)
1341{ 1338{
1342 struct ipgre_net *ign; 1339 struct ipgre_net *ign;
1340 LIST_HEAD(list);
1343 1341
1344 ign = net_generic(net, ipgre_net_id); 1342 ign = net_generic(net, ipgre_net_id);
1345 rtnl_lock(); 1343 rtnl_lock();
1346 ipgre_destroy_tunnels(ign); 1344 ipgre_destroy_tunnels(ign, &list);
1345 unregister_netdevice_many(&list);
1347 rtnl_unlock(); 1346 rtnl_unlock();
1348 kfree(ign);
1349} 1347}
1350 1348
1351static struct pernet_operations ipgre_net_ops = { 1349static struct pernet_operations ipgre_net_ops = {
1352 .init = ipgre_init_net, 1350 .init = ipgre_init_net,
1353 .exit = ipgre_exit_net, 1351 .exit = ipgre_exit_net,
1352 .id = &ipgre_net_id,
1353 .size = sizeof(struct ipgre_net),
1354}; 1354};
1355 1355
1356static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) 1356static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
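Setting .id and .size in pernet_operations moves the per-namespace allocation into the core: register_pernet_device() allocates a zeroed struct ipgre_net for each net and assigns ipgre_net_id, which is why ipgre_init_net() lost its kzalloc/net_assign_generic/kfree dance. Lookup is unchanged:

    struct ipgre_net *ign = net_generic(net, ipgre_net_id);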
@@ -1471,7 +1471,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1471 dev->features |= NETIF_F_NETNS_LOCAL; 1471 dev->features |= NETIF_F_NETNS_LOCAL;
1472} 1472}
1473 1473
1474static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], 1474static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1475 struct nlattr *data[]) 1475 struct nlattr *data[])
1476{ 1476{
1477 struct ip_tunnel *nt; 1477 struct ip_tunnel *nt;
@@ -1665,15 +1665,16 @@ static int __init ipgre_init(void)
1665 1665
1666 printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); 1666 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667 1667
1668 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { 1668 err = register_pernet_device(&ipgre_net_ops);
1669 if (err < 0)
1670 return err;
1671
1672 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1673 if (err < 0) {
1669 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1674 printk(KERN_INFO "ipgre init: can't add protocol\n");
1670 return -EAGAIN; 1675 goto add_proto_failed;
1671 } 1676 }
1672 1677
1673 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1674 if (err < 0)
1675 goto gen_device_failed;
1676
1677 err = rtnl_link_register(&ipgre_link_ops); 1678 err = rtnl_link_register(&ipgre_link_ops);
1678 if (err < 0) 1679 if (err < 0)
1679 goto rtnl_link_failed; 1680 goto rtnl_link_failed;
@@ -1688,9 +1689,9 @@ out:
1688tap_ops_failed: 1689tap_ops_failed:
1689 rtnl_link_unregister(&ipgre_link_ops); 1690 rtnl_link_unregister(&ipgre_link_ops);
1690rtnl_link_failed: 1691rtnl_link_failed:
1691 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1692gen_device_failed:
1693 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1692 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1693add_proto_failed:
1694 unregister_pernet_device(&ipgre_net_ops);
1694 goto out; 1695 goto out;
1695} 1696}
1696 1697
@@ -1698,9 +1699,9 @@ static void __exit ipgre_fini(void)
1698{ 1699{
1699 rtnl_link_unregister(&ipgre_tap_ops); 1700 rtnl_link_unregister(&ipgre_tap_ops);
1700 rtnl_link_unregister(&ipgre_link_ops); 1701 rtnl_link_unregister(&ipgre_link_ops);
1701 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1702 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1702 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1703 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704 unregister_pernet_device(&ipgre_net_ops);
1704} 1705}
1705 1706
1706module_init(ipgre_init); 1707module_init(ipgre_init);
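The init reordering is about visibility: per-net state must exist before inet_add_protocol() makes GRE packets deliverable, and teardown runs in the exact reverse order so no packet can race into half-torn-down state. Condensed from the hunks:

    err = register_pernet_device(&ipgre_net_ops);          /* state first */
    if (err < 0)
        return err;
    err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); /* then traffic */
    if (err < 0)
        goto add_proto_failed;                             /* unwinds pernet */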
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6c98b43badf4..f8ab7a380d4a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -119,6 +119,7 @@
119#include <linux/kernel.h> 119#include <linux/kernel.h>
120#include <linux/string.h> 120#include <linux/string.h>
121#include <linux/errno.h> 121#include <linux/errno.h>
122#include <linux/slab.h>
122 123
123#include <linux/net.h> 124#include <linux/net.h>
124#include <linux/socket.h> 125#include <linux/socket.h>
@@ -161,10 +162,10 @@ int ip_call_ra_chain(struct sk_buff *skb)
161 /* If socket is bound to an interface, only report 162 /* If socket is bound to an interface, only report
162 * the packet if it came from that interface. 163 * the packet if it came from that interface.
163 */ 164 */
164 if (sk && inet_sk(sk)->num == protocol && 165 if (sk && inet_sk(sk)->inet_num == protocol &&
165 (!sk->sk_bound_dev_if || 166 (!sk->sk_bound_dev_if ||
166 sk->sk_bound_dev_if == dev->ifindex) && 167 sk->sk_bound_dev_if == dev->ifindex) &&
167 sock_net(sk) == dev_net(dev)) { 168 net_eq(sock_net(sk), dev_net(dev))) {
168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 169 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { 170 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
170 read_unlock(&ip_ra_lock); 171 read_unlock(&ip_ra_lock);
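The net_eq() conversion only changes behavior when namespaces are compiled out: there it evaluates to a constant, letting the compiler delete the comparison. Roughly (the net/net_namespace.h definition of this era, reconstructed from memory, not quoted from the patch):

    static inline int net_eq(const struct net *net1, const struct net *net2)
    {
    #ifdef CONFIG_NET_NS
        return net1 == net2;
    #else
        return 1;          /* single namespace: trivially equal */
    #endif
    }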
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 94bf105ef3c9..4c09a31fd140 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/types.h> 15#include <linux/types.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9895180f481..d1bcc9f21d4f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -51,6 +51,7 @@
51#include <linux/string.h> 51#include <linux/string.h>
52#include <linux/errno.h> 52#include <linux/errno.h>
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/slab.h>
54 55
55#include <linux/socket.h> 56#include <linux/socket.h>
56#include <linux/sockios.h> 57#include <linux/sockios.h>
@@ -119,7 +120,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 newskb->pkt_type = PACKET_LOOPBACK; 120 newskb->pkt_type = PACKET_LOOPBACK;
120 newskb->ip_summed = CHECKSUM_UNNECESSARY; 121 newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 WARN_ON(!skb_dst(newskb)); 122 WARN_ON(!skb_dst(newskb));
122 netif_rx(newskb); 123 netif_rx_ni(newskb);
123 return 0; 124 return 0;
124} 125}
125 126
@@ -254,7 +255,7 @@ int ip_mc_output(struct sk_buff *skb)
254 */ 255 */
255 256
256 if (rt->rt_flags&RTCF_MULTICAST) { 257 if (rt->rt_flags&RTCF_MULTICAST) {
257 if ((!sk || inet_sk(sk)->mc_loop) 258 if (sk_mc_loop(sk)
258#ifdef CONFIG_IP_MROUTE 259#ifdef CONFIG_IP_MROUTE
259 /* Small optimization: do not loopback not local frames, 260 /* Small optimization: do not loopback not local frames,
260 which returned after forwarding; they will be dropped 261 which returned after forwarding; they will be dropped
@@ -264,9 +265,11 @@ int ip_mc_output(struct sk_buff *skb)
264 265
265 This check is duplicated in ip_mr_input at the moment. 266 This check is duplicated in ip_mr_input at the moment.
266 */ 267 */
267 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) 268 &&
269 ((rt->rt_flags & RTCF_LOCAL) ||
270 !(IPCB(skb)->flags & IPSKB_FORWARDED))
268#endif 271#endif
269 ) { 272 ) {
270 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 273 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
271 if (newskb) 274 if (newskb)
272 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, 275 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
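sk_mc_loop() folds the NULL-socket default and the per-family loopback flag into one helper, which also makes this path behave for IPv6 sockets emitting IPv4 multicast. Approximately (a hedged reconstruction of the net/core/sock.c helper, not a verbatim quote):

    int sk_mc_loop(struct sock *sk)
    {
        if (!sk)
            return 1;                      /* no socket: loop back by default */
        switch (sk->sk_family) {
        case AF_INET:
            return inet_sk(sk)->mc_loop;
    #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
        case AF_INET6:
            return inet6_sk(sk)->mc_loop;
    #endif
        }
        return 1;
    }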
@@ -329,7 +332,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
329 __be32 daddr; 332 __be32 daddr;
330 333
331 /* Use correct destination address if we have options. */ 334 /* Use correct destination address if we have options. */
332 daddr = inet->daddr; 335 daddr = inet->inet_daddr;
333 if(opt && opt->srr) 336 if(opt && opt->srr)
334 daddr = opt->faddr; 337 daddr = opt->faddr;
335 338
@@ -338,13 +341,13 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
338 .mark = sk->sk_mark, 341 .mark = sk->sk_mark,
339 .nl_u = { .ip4_u = 342 .nl_u = { .ip4_u =
340 { .daddr = daddr, 343 { .daddr = daddr,
341 .saddr = inet->saddr, 344 .saddr = inet->inet_saddr,
342 .tos = RT_CONN_FLAGS(sk) } }, 345 .tos = RT_CONN_FLAGS(sk) } },
343 .proto = sk->sk_protocol, 346 .proto = sk->sk_protocol,
344 .flags = inet_sk_flowi_flags(sk), 347 .flags = inet_sk_flowi_flags(sk),
345 .uli_u = { .ports = 348 .uli_u = { .ports =
346 { .sport = inet->sport, 349 { .sport = inet->inet_sport,
347 .dport = inet->dport } } }; 350 .dport = inet->inet_dport } } };
348 351
349 /* If this fails, retransmit mechanism of transport layer will 352 /* If this fails, retransmit mechanism of transport layer will
350 * keep trying until route appears or the connection times 353 * keep trying until route appears or the connection times
@@ -379,7 +382,7 @@ packet_routed:
379 382
380 if (opt && opt->optlen) { 383 if (opt && opt->optlen) {
381 iph->ihl += opt->optlen >> 2; 384 iph->ihl += opt->optlen >> 2;
382 ip_options_build(skb, opt, inet->daddr, rt, 0); 385 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
383 } 386 }
384 387
385 ip_select_ident_more(iph, &rt->u.dst, sk, 388 ip_select_ident_more(iph, &rt->u.dst, sk,
@@ -501,8 +504,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
501 if (skb->sk) { 504 if (skb->sk) {
502 frag->sk = skb->sk; 505 frag->sk = skb->sk;
503 frag->destructor = sock_wfree; 506 frag->destructor = sock_wfree;
504 truesizes += frag->truesize;
505 } 507 }
508 truesizes += frag->truesize;
506 } 509 }
507 510
508 /* Everything is OK. Generate! */ 511 /* Everything is OK. Generate! */
@@ -846,7 +849,8 @@ int ip_append_data(struct sock *sk,
846 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 849 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
847 850
848 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 851 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
849 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); 852 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
853 mtu-exthdrlen);
850 return -EMSGSIZE; 854 return -EMSGSIZE;
851 } 855 }
852 856
@@ -1100,7 +1104,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1100 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1104 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1101 1105
1102 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1106 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1103 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); 1107 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1104 return -EMSGSIZE; 1108 return -EMSGSIZE;
1105 } 1109 }
1106 1110
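The inet->daddr style accesses scattered through this file become inet->inet_daddr and friends: the struct inet_sock fields gained an inet_ prefix in a mechanical, tree-wide rename with no behavior change. The mapping seen in these hunks, for grepping:

    /* old field         new field            */
    /* inet->daddr       inet->inet_daddr     */
    /* inet->saddr       inet->inet_saddr     */
    /* inet->rcv_saddr   inet->inet_rcv_saddr */
    /* inet->sport       inet->inet_sport     */
    /* inet->dport       inet->inet_dport     */
    /* inet->num         inet->inet_num       */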
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e982b5c1ee17..1e64dabbd232 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -23,6 +23,7 @@
23#include <linux/icmp.h> 23#include <linux/icmp.h>
24#include <linux/inetdevice.h> 24#include <linux/inetdevice.h>
25#include <linux/netdevice.h> 25#include <linux/netdevice.h>
26#include <linux/slab.h>
26#include <net/sock.h> 27#include <net/sock.h>
27#include <net/ip.h> 28#include <net/ip.h>
28#include <net/icmp.h> 29#include <net/icmp.h>
@@ -245,7 +246,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
245{ 246{
246 struct ip_ra_chain *ra, *new_ra, **rap; 247 struct ip_ra_chain *ra, *new_ra, **rap;
247 248
248 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) 249 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
249 return -EINVAL; 250 return -EINVAL;
250 251
251 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 252 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -451,7 +452,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
451 (1<<IP_TTL) | (1<<IP_HDRINCL) | 452 (1<<IP_TTL) | (1<<IP_HDRINCL) |
452 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 453 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
453 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | 454 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
454 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || 455 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
456 (1<<IP_MINTTL))) ||
455 optname == IP_MULTICAST_TTL || 457 optname == IP_MULTICAST_TTL ||
456 optname == IP_MULTICAST_ALL || 458 optname == IP_MULTICAST_ALL ||
457 optname == IP_MULTICAST_LOOP || 459 optname == IP_MULTICAST_LOOP ||
@@ -480,7 +482,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
480 case IP_OPTIONS: 482 case IP_OPTIONS:
481 { 483 {
482 struct ip_options *opt = NULL; 484 struct ip_options *opt = NULL;
483 if (optlen > 40 || optlen < 0) 485 if (optlen > 40)
484 goto e_inval; 486 goto e_inval;
485 err = ip_options_get_from_user(sock_net(sk), &opt, 487 err = ip_options_get_from_user(sock_net(sk), &opt,
486 optval, optlen); 488 optval, optlen);
@@ -492,7 +494,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
492 if (sk->sk_family == PF_INET || 494 if (sk->sk_family == PF_INET ||
493 (!((1 << sk->sk_state) & 495 (!((1 << sk->sk_state) &
494 (TCPF_LISTEN | TCPF_CLOSE)) && 496 (TCPF_LISTEN | TCPF_CLOSE)) &&
495 inet->daddr != LOOPBACK4_IPV6)) { 497 inet->inet_daddr != LOOPBACK4_IPV6)) {
496#endif 498#endif
497 if (inet->opt) 499 if (inet->opt)
498 icsk->icsk_ext_hdr_len -= inet->opt->optlen; 500 icsk->icsk_ext_hdr_len -= inet->opt->optlen;
@@ -575,7 +577,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
575 inet->hdrincl = val ? 1 : 0; 577 inet->hdrincl = val ? 1 : 0;
576 break; 578 break;
577 case IP_MTU_DISCOVER: 579 case IP_MTU_DISCOVER:
578 if (val < 0 || val > 3) 580 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
579 goto e_inval; 581 goto e_inval;
580 inet->pmtudisc = val; 582 inet->pmtudisc = val;
581 break; 583 break;
@@ -936,6 +938,14 @@ mc_msf_out:
936 inet->transparent = !!val; 938 inet->transparent = !!val;
937 break; 939 break;
938 940
941 case IP_MINTTL:
942 if (optlen < 1)
943 goto e_inval;
944 if (val < 0 || val > 255)
945 goto e_inval;
946 inet->min_ttl = val;
947 break;
948
939 default: 949 default:
940 err = -ENOPROTOOPT; 950 err = -ENOPROTOOPT;
941 break; 951 break;
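IP_MINTTL makes the stack drop inbound packets whose TTL is below the configured floor before they reach the socket, the kernel half of GTSM (RFC 5082): a directly connected peer sending with TTL 255 passes, anything routed in from farther away cannot. A hedged userspace sketch (assumes a kernel with this patch; the fallback define matches the value this series adds to linux/in.h):

    #include <stdio.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    #ifndef IP_MINTTL
    #define IP_MINTTL 21
    #endif

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int minttl = 255;  /* accept only directly connected peers */

        if (setsockopt(fd, IPPROTO_IP, IP_MINTTL, &minttl, sizeof(minttl)) < 0)
            perror("setsockopt(IP_MINTTL)");
        return 0;
    }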
@@ -1180,8 +1190,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1180 if (inet->cmsg_flags & IP_CMSG_PKTINFO) { 1190 if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
1181 struct in_pktinfo info; 1191 struct in_pktinfo info;
1182 1192
1183 info.ipi_addr.s_addr = inet->rcv_saddr; 1193 info.ipi_addr.s_addr = inet->inet_rcv_saddr;
1184 info.ipi_spec_dst.s_addr = inet->rcv_saddr; 1194 info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
1185 info.ipi_ifindex = inet->mc_index; 1195 info.ipi_ifindex = inet->mc_index;
1186 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 1196 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
1187 } 1197 }
@@ -1198,6 +1208,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1198 case IP_TRANSPARENT: 1208 case IP_TRANSPARENT:
1199 val = inet->transparent; 1209 val = inet->transparent;
1200 break; 1210 break;
1211 case IP_MINTTL:
1212 val = inet->min_ttl;
1213 break;
1201 default: 1214 default:
1202 release_sock(sk); 1215 release_sock(sk);
1203 return -ENOPROTOOPT; 1216 return -ENOPROTOOPT;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 38fbf04150ae..629067571f02 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -25,6 +25,7 @@
25 25
26static void ipcomp4_err(struct sk_buff *skb, u32 info) 26static void ipcomp4_err(struct sk_buff *skb, u32 info)
27{ 27{
28 struct net *net = dev_net(skb->dev);
28 __be32 spi; 29 __be32 spi;
29 struct iphdr *iph = (struct iphdr *)skb->data; 30 struct iphdr *iph = (struct iphdr *)skb->data;
30 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
@@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
35 return; 36 return;
36 37
37 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
38 x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
39 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
40 if (!x) 41 if (!x)
41 return; 42 return;
@@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
47/* We always hold one tunnel user reference to indicate a tunnel */ 48/* We always hold one tunnel user reference to indicate a tunnel */
48static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) 49static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
49{ 50{
51 struct net *net = xs_net(x);
50 struct xfrm_state *t; 52 struct xfrm_state *t;
51 53
52 t = xfrm_state_alloc(&init_net); 54 t = xfrm_state_alloc(net);
53 if (t == NULL) 55 if (t == NULL)
54 goto out; 56 goto out;
55 57
@@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
61 t->props.mode = x->props.mode; 63 t->props.mode = x->props.mode;
62 t->props.saddr.a4 = x->props.saddr.a4; 64 t->props.saddr.a4 = x->props.saddr.a4;
63 t->props.flags = x->props.flags; 65 t->props.flags = x->props.flags;
66 memcpy(&t->mark, &x->mark, sizeof(t->mark));
64 67
65 if (xfrm_init_state(t)) 68 if (xfrm_init_state(t))
66 goto error; 69 goto error;
@@ -82,10 +85,12 @@ error:
82 */ 85 */
83static int ipcomp_tunnel_attach(struct xfrm_state *x) 86static int ipcomp_tunnel_attach(struct xfrm_state *x)
84{ 87{
88 struct net *net = xs_net(x);
85 int err = 0; 89 int err = 0;
86 struct xfrm_state *t; 90 struct xfrm_state *t;
91 u32 mark = x->mark.v & x->mark.m;
87 92
88 t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, 93 t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
89 x->props.saddr.a4, IPPROTO_IPIP, AF_INET); 94 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
90 if (!t) { 95 if (!t) {
91 t = ipcomp_tunnel_create(x); 96 t = ipcomp_tunnel_create(x);
@@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x)
124 if (x->props.mode == XFRM_MODE_TUNNEL) { 129 if (x->props.mode == XFRM_MODE_TUNNEL) {
125 err = ipcomp_tunnel_attach(x); 130 err = ipcomp_tunnel_attach(x);
126 if (err) 131 if (err)
127 goto error_tunnel; 132 goto out;
128 } 133 }
129 134
130 err = 0; 135 err = 0;
131out: 136out:
132 return err; 137 return err;
133
134error_tunnel:
135 ipcomp_destroy(x);
136 goto out;
137} 138}
138 139
139static const struct xfrm_type ipcomp_type = { 140static const struct xfrm_type ipcomp_type = {
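The extra argument threaded through ipcomp4_err() and ipcomp_tunnel_attach() is the xfrm mark: state lookups now match on it, so marked states bind only to flows carrying the same skb->mark. The state side uses its value masked by its mask, exactly as the hunk computes:

    u32 mark = x->mark.v & x->mark.m;  /* effective mark of this state */
    t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
                          x->props.saddr.a4, IPPROTO_IPIP, AF_INET);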
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c256454..067ce9e043dc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -53,6 +53,7 @@
53#include <linux/root_dev.h> 53#include <linux/root_dev.h>
54#include <linux/delay.h> 54#include <linux/delay.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include <linux/slab.h>
56#include <net/net_namespace.h> 57#include <net/net_namespace.h>
57#include <net/arp.h> 58#include <net/arp.h>
58#include <net/ip.h> 59#include <net/ip.h>
@@ -187,6 +188,16 @@ struct ic_device {
187static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ 188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
188static struct net_device *ic_dev __initdata = NULL; /* Selected device */ 189static struct net_device *ic_dev __initdata = NULL; /* Selected device */
189 190
191static bool __init ic_device_match(struct net_device *dev)
192{
193 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
194 (!(dev->flags & IFF_LOOPBACK) &&
195 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
196 strncmp(dev->name, "dummy", 5)))
197 return true;
198 return false;
199}
200
190static int __init ic_open_devs(void) 201static int __init ic_open_devs(void)
191{ 202{
192 struct ic_device *d, **last; 203 struct ic_device *d, **last;
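Factoring the predicate out means ic_open_devs() and the wait_for_devices() loop added further down agree on what counts as a usable boot interface. The same test, flattened for readability (equivalent logic, not the patch's exact form):

    static bool __init ic_device_match(struct net_device *dev)
    {
        if (user_dev_name[0])                       /* explicit ip=...:<dev> */
            return strcmp(dev->name, user_dev_name) == 0;
        if (dev->flags & IFF_LOOPBACK)              /* never lo */
            return false;
        if (!(dev->flags & (IFF_POINTOPOINT | IFF_BROADCAST)))
            return false;                           /* need a usable link type */
        return strncmp(dev->name, "dummy", 5) != 0; /* never dummyN */
    }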
@@ -207,10 +218,7 @@ static int __init ic_open_devs(void)
207 for_each_netdev(&init_net, dev) { 218 for_each_netdev(&init_net, dev) {
208 if (dev->flags & IFF_LOOPBACK) 219 if (dev->flags & IFF_LOOPBACK)
209 continue; 220 continue;
210 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : 221 if (ic_device_match(dev)) {
211 (!(dev->flags & IFF_LOOPBACK) &&
212 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
213 strncmp(dev->name, "dummy", 5))) {
214 int able = 0; 222 int able = 0;
215 if (dev->mtu >= 364) 223 if (dev->mtu >= 364)
216 able |= IC_BOOTP; 224 able |= IC_BOOTP;
@@ -228,7 +236,7 @@ static int __init ic_open_devs(void)
228 } 236 }
229 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { 237 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
230 rtnl_unlock(); 238 rtnl_unlock();
231 return -1; 239 return -ENOMEM;
232 } 240 }
233 d->dev = dev; 241 d->dev = dev;
234 *last = d; 242 *last = d;
@@ -253,7 +261,7 @@ static int __init ic_open_devs(void)
253 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); 261 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
254 else 262 else
255 printk(KERN_ERR "IP-Config: No network devices available.\n"); 263 printk(KERN_ERR "IP-Config: No network devices available.\n");
256 return -1; 264 return -ENODEV;
257 } 265 }
258 return 0; 266 return 0;
259} 267}
@@ -1172,10 +1180,9 @@ static int __init ic_dynamic(void)
1172 schedule_timeout_uninterruptible(1); 1180 schedule_timeout_uninterruptible(1);
1173#ifdef IPCONFIG_DHCP 1181#ifdef IPCONFIG_DHCP
1174 /* DHCP isn't done until we get a DHCPACK. */ 1182 /* DHCP isn't done until we get a DHCPACK. */
1175 if ((ic_got_reply & IC_BOOTP) 1183 if ((ic_got_reply & IC_BOOTP) &&
1176 && (ic_proto_enabled & IC_USE_DHCP) 1184 (ic_proto_enabled & IC_USE_DHCP) &&
1177 && ic_dhcp_msgtype != DHCPACK) 1185 ic_dhcp_msgtype != DHCPACK) {
1178 {
1179 ic_got_reply = 0; 1186 ic_got_reply = 0;
1180 printk(","); 1187 printk(",");
1181 continue; 1188 continue;
@@ -1304,6 +1311,32 @@ __be32 __init root_nfs_parse_addr(char *name)
1304 return addr; 1311 return addr;
1305} 1312}
1306 1313
1314#define DEVICE_WAIT_MAX 12 /* 12 seconds */
1315
1316static int __init wait_for_devices(void)
1317{
1318 int i;
1319
1320 msleep(CONF_PRE_OPEN);
1321 for (i = 0; i < DEVICE_WAIT_MAX; i++) {
1322 struct net_device *dev;
1323 int found = 0;
1324
1325 rtnl_lock();
1326 for_each_netdev(&init_net, dev) {
1327 if (ic_device_match(dev)) {
1328 found = 1;
1329 break;
1330 }
1331 }
1332 rtnl_unlock();
1333 if (found)
1334 return 0;
1335 ssleep(1);
1336 }
1337 return -ENODEV;
1338}
1339
1307/* 1340/*
1308 * IP Autoconfig dispatcher. 1341 * IP Autoconfig dispatcher.
1309 */ 1342 */
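wait_for_devices() replaces the old fixed "settle" sleep with bounded polling: scan under rtnl_lock() once a second, succeed as soon as any matching device has registered, and give up with -ENODEV after DEVICE_WAIT_MAX (12) seconds so ip_auto_config() fails fast instead of timing out protocol by protocol. Its skeleton, with device_present() as a hypothetical stand-in for the rtnl-protected for_each_netdev() scan above:

    for (i = 0; i < DEVICE_WAIT_MAX; i++) {
        if (device_present())
            return 0;
        ssleep(1);
    }
    return -ENODEV;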
@@ -1314,6 +1347,7 @@ static int __init ip_auto_config(void)
1314#ifdef IPCONFIG_DYNAMIC 1347#ifdef IPCONFIG_DYNAMIC
1315 int retries = CONF_OPEN_RETRIES; 1348 int retries = CONF_OPEN_RETRIES;
1316#endif 1349#endif
1350 int err;
1317 1351
1318#ifdef CONFIG_PROC_FS 1352#ifdef CONFIG_PROC_FS
1319 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1353 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1326,12 +1360,15 @@ static int __init ip_auto_config(void)
1326#ifdef IPCONFIG_DYNAMIC 1360#ifdef IPCONFIG_DYNAMIC
1327 try_try_again: 1361 try_try_again:
1328#endif 1362#endif
1329 /* Give hardware a chance to settle */ 1363 /* Wait for devices to appear */
1330 msleep(CONF_PRE_OPEN); 1364 err = wait_for_devices();
1365 if (err)
1366 return err;
1331 1367
1332 /* Setup all network devices */ 1368 /* Setup all network devices */
1333 if (ic_open_devs() < 0) 1369 err = ic_open_devs();
1334 return -1; 1370 if (err)
1371 return err;
1335 1372
1336 /* Give drivers a chance to settle */ 1373 /* Give drivers a chance to settle */
1337 ssleep(CONF_POST_OPEN); 1374 ssleep(CONF_POST_OPEN);
@@ -1344,9 +1381,9 @@ static int __init ip_auto_config(void)
1344 */ 1381 */
1345 if (ic_myaddr == NONE || 1382 if (ic_myaddr == NONE ||
1346#ifdef CONFIG_ROOT_NFS 1383#ifdef CONFIG_ROOT_NFS
1347 (root_server_addr == NONE 1384 (root_server_addr == NONE &&
1348 && ic_servaddr == NONE 1385 ic_servaddr == NONE &&
1349 && ROOT_DEV == Root_NFS) || 1386 ROOT_DEV == Root_NFS) ||
1350#endif 1387#endif
1351 ic_first_dev->next) { 1388 ic_first_dev->next) {
1352#ifdef IPCONFIG_DYNAMIC 1389#ifdef IPCONFIG_DYNAMIC
@@ -1447,7 +1484,7 @@ late_initcall(ip_auto_config);
1447 1484
1448/* 1485/*
1449 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel 1486 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
1450 * command line parameter. See Documentation/filesystems/nfsroot.txt. 1487 * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
1451 */ 1488 */
1452static int __init ic_proto_name(char *name) 1489static int __init ic_proto_name(char *name)
1453{ 1490{
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae40ed1ba560..0b27b14dcc9d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -95,6 +95,7 @@
95#include <linux/module.h> 95#include <linux/module.h>
96#include <linux/types.h> 96#include <linux/types.h>
97#include <linux/kernel.h> 97#include <linux/kernel.h>
98#include <linux/slab.h>
98#include <asm/uaccess.h> 99#include <asm/uaccess.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/netdevice.h> 101#include <linux/netdevice.h>
@@ -119,7 +120,7 @@
119#define HASH_SIZE 16 120#define HASH_SIZE 16
120#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
121 122
122static int ipip_net_id; 123static int ipip_net_id __read_mostly;
123struct ipip_net { 124struct ipip_net {
124 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel *tunnels_r_l[HASH_SIZE];
125 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel *tunnels_r[HASH_SIZE];
@@ -130,11 +131,16 @@ struct ipip_net {
130 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
131}; 132};
132 133
133static void ipip_fb_tunnel_init(struct net_device *dev);
134static void ipip_tunnel_init(struct net_device *dev); 134static void ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136 136
137static DEFINE_RWLOCK(ipip_lock); 137/*
138 * Locking : hash tables are protected by RCU and a spinlock
139 */
140static DEFINE_SPINLOCK(ipip_lock);
141
142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
138 144
139static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
140 __be32 remote, __be32 local) 146 __be32 remote, __be32 local)
@@ -144,20 +150,21 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
144 struct ip_tunnel *t; 150 struct ip_tunnel *t;
145 struct ipip_net *ipn = net_generic(net, ipip_net_id); 151 struct ipip_net *ipn = net_generic(net, ipip_net_id);
146 152
147 for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { 153 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
148 if (local == t->parms.iph.saddr && 154 if (local == t->parms.iph.saddr &&
149 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 155 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
150 return t; 156 return t;
151 } 157
152 for (t = ipn->tunnels_r[h0]; t; t = t->next) { 158 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
153 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 159 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
154 return t; 160 return t;
155 } 161
156 for (t = ipn->tunnels_l[h1]; t; t = t->next) { 162 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
157 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) 163 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
158 return t; 164 return t;
159 } 165
160 if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) 166 t = rcu_dereference(ipn->tunnels_wc[0]);
167 if (t && (t->dev->flags&IFF_UP))
161 return t; 168 return t;
162 return NULL; 169 return NULL;
163} 170}
@@ -193,9 +200,9 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
193 200
194 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
195 if (t == *tp) { 202 if (t == *tp) {
196 write_lock_bh(&ipip_lock); 203 spin_lock_bh(&ipip_lock);
197 *tp = t->next; 204 *tp = t->next;
198 write_unlock_bh(&ipip_lock); 205 spin_unlock_bh(&ipip_lock);
199 break; 206 break;
200 } 207 }
201 } 208 }
@@ -205,10 +212,10 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
205{ 212{
206 struct ip_tunnel **tp = ipip_bucket(ipn, t); 213 struct ip_tunnel **tp = ipip_bucket(ipn, t);
207 214
215 spin_lock_bh(&ipip_lock);
208 t->next = *tp; 216 t->next = *tp;
209 write_lock_bh(&ipip_lock); 217 rcu_assign_pointer(*tp, t);
210 *tp = t; 218 spin_unlock_bh(&ipip_lock);
211 write_unlock_bh(&ipip_lock);
212} 219}
213 220
214static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 221static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -267,9 +274,9 @@ static void ipip_tunnel_uninit(struct net_device *dev)
267 struct ipip_net *ipn = net_generic(net, ipip_net_id); 274 struct ipip_net *ipn = net_generic(net, ipip_net_id);
268 275
269 if (dev == ipn->fb_tunnel_dev) { 276 if (dev == ipn->fb_tunnel_dev) {
270 write_lock_bh(&ipip_lock); 277 spin_lock_bh(&ipip_lock);
271 ipn->tunnels_wc[0] = NULL; 278 ipn->tunnels_wc[0] = NULL;
272 write_unlock_bh(&ipip_lock); 279 spin_unlock_bh(&ipip_lock);
273 } else 280 } else
274 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 281 ipip_tunnel_unlink(ipn, netdev_priv(dev));
275 dev_put(dev); 282 dev_put(dev);
@@ -318,7 +325,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
318 325
319 err = -ENOENT; 326 err = -ENOENT;
320 327
321 read_lock(&ipip_lock); 328 rcu_read_lock();
322 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 329 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
323 if (t == NULL || t->parms.iph.daddr == 0) 330 if (t == NULL || t->parms.iph.daddr == 0)
324 goto out; 331 goto out;
@@ -333,7 +340,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
333 t->err_count = 1; 340 t->err_count = 1;
334 t->err_time = jiffies; 341 t->err_time = jiffies;
335out: 342out:
336 read_unlock(&ipip_lock); 343 rcu_read_unlock();
337 return err; 344 return err;
338} 345}
339 346
@@ -351,11 +358,11 @@ static int ipip_rcv(struct sk_buff *skb)
351 struct ip_tunnel *tunnel; 358 struct ip_tunnel *tunnel;
352 const struct iphdr *iph = ip_hdr(skb); 359 const struct iphdr *iph = ip_hdr(skb);
353 360
354 read_lock(&ipip_lock); 361 rcu_read_lock();
355 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
356 iph->saddr, iph->daddr)) != NULL) { 363 iph->saddr, iph->daddr)) != NULL) {
357 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
358 read_unlock(&ipip_lock); 365 rcu_read_unlock();
359 kfree_skb(skb); 366 kfree_skb(skb);
360 return 0; 367 return 0;
361 } 368 }
@@ -374,10 +381,10 @@ static int ipip_rcv(struct sk_buff *skb)
374 nf_reset(skb); 381 nf_reset(skb);
375 ipip_ecn_decapsulate(iph, skb); 382 ipip_ecn_decapsulate(iph, skb);
376 netif_rx(skb); 383 netif_rx(skb);
377 read_unlock(&ipip_lock); 384 rcu_read_unlock();
378 return 0; 385 return 0;
379 } 386 }
380 read_unlock(&ipip_lock); 387 rcu_read_unlock();
381 388
382 return -1; 389 return -1;
383} 390}
@@ -390,7 +397,8 @@ static int ipip_rcv(struct sk_buff *skb)
390static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 397static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
391{ 398{
392 struct ip_tunnel *tunnel = netdev_priv(dev); 399 struct ip_tunnel *tunnel = netdev_priv(dev);
393 struct net_device_stats *stats = &tunnel->dev->stats; 400 struct net_device_stats *stats = &dev->stats;
401 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
394 struct iphdr *tiph = &tunnel->parms.iph; 402 struct iphdr *tiph = &tunnel->parms.iph;
395 u8 tos = tunnel->parms.iph.tos; 403 u8 tos = tunnel->parms.iph.tos;
396 __be16 df = tiph->frag_off; 404 __be16 df = tiph->frag_off;
@@ -480,7 +488,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
480 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 488 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
481 if (!new_skb) { 489 if (!new_skb) {
482 ip_rt_put(rt); 490 ip_rt_put(rt);
483 stats->tx_dropped++; 491 txq->tx_dropped++;
484 dev_kfree_skb(skb); 492 dev_kfree_skb(skb);
485 return NETDEV_TX_OK; 493 return NETDEV_TX_OK;
486 } 494 }
@@ -722,7 +730,7 @@ static void ipip_tunnel_init(struct net_device *dev)
722 ipip_tunnel_bind_dev(dev); 730 ipip_tunnel_bind_dev(dev);
723} 731}
724 732
725static void ipip_fb_tunnel_init(struct net_device *dev) 733static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
726{ 734{
727 struct ip_tunnel *tunnel = netdev_priv(dev); 735 struct ip_tunnel *tunnel = netdev_priv(dev);
728 struct iphdr *iph = &tunnel->parms.iph; 736 struct iphdr *iph = &tunnel->parms.iph;
@@ -748,33 +756,27 @@ static struct xfrm_tunnel ipip_handler = {
748static const char banner[] __initconst = 756static const char banner[] __initconst =
749 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 757 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
750 758
751static void ipip_destroy_tunnels(struct ipip_net *ipn) 759static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
752{ 760{
753 int prio; 761 int prio;
754 762
755 for (prio = 1; prio < 4; prio++) { 763 for (prio = 1; prio < 4; prio++) {
756 int h; 764 int h;
757 for (h = 0; h < HASH_SIZE; h++) { 765 for (h = 0; h < HASH_SIZE; h++) {
758 struct ip_tunnel *t; 766 struct ip_tunnel *t = ipn->tunnels[prio][h];
759 while ((t = ipn->tunnels[prio][h]) != NULL) 767
760 unregister_netdevice(t->dev); 768 while (t != NULL) {
769 unregister_netdevice_queue(t->dev, head);
770 t = t->next;
771 }
761 } 772 }
762 } 773 }
763} 774}
764 775
765static int ipip_init_net(struct net *net) 776static int __net_init ipip_init_net(struct net *net)
766{ 777{
778 struct ipip_net *ipn = net_generic(net, ipip_net_id);
767 int err; 779 int err;
768 struct ipip_net *ipn;
769
770 err = -ENOMEM;
771 ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
772 if (ipn == NULL)
773 goto err_alloc;
774
775 err = net_assign_generic(net, ipip_net_id, ipn);
776 if (err < 0)
777 goto err_assign;
778 780
779 ipn->tunnels[0] = ipn->tunnels_wc; 781 ipn->tunnels[0] = ipn->tunnels_wc;
780 ipn->tunnels[1] = ipn->tunnels_l; 782 ipn->tunnels[1] = ipn->tunnels_l;
@@ -801,27 +803,26 @@ err_reg_dev:
801 free_netdev(ipn->fb_tunnel_dev); 803 free_netdev(ipn->fb_tunnel_dev);
802err_alloc_dev: 804err_alloc_dev:
803 /* nothing */ 805 /* nothing */
804err_assign:
805 kfree(ipn);
806err_alloc:
807 return err; 806 return err;
808} 807}
809 808
810static void ipip_exit_net(struct net *net) 809static void __net_exit ipip_exit_net(struct net *net)
811{ 810{
812 struct ipip_net *ipn; 811 struct ipip_net *ipn = net_generic(net, ipip_net_id);
812 LIST_HEAD(list);
813 813
814 ipn = net_generic(net, ipip_net_id);
815 rtnl_lock(); 814 rtnl_lock();
816 ipip_destroy_tunnels(ipn); 815 ipip_destroy_tunnels(ipn, &list);
817 unregister_netdevice(ipn->fb_tunnel_dev); 816 unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
817 unregister_netdevice_many(&list);
818 rtnl_unlock(); 818 rtnl_unlock();
819 kfree(ipn);
820} 819}
821 820
822static struct pernet_operations ipip_net_ops = { 821static struct pernet_operations ipip_net_ops = {
823 .init = ipip_init_net, 822 .init = ipip_init_net,
824 .exit = ipip_exit_net, 823 .exit = ipip_exit_net,
824 .id = &ipip_net_id,
825 .size = sizeof(struct ipip_net),
825}; 826};
826 827
827static int __init ipip_init(void) 828static int __init ipip_init(void)
@@ -830,15 +831,14 @@ static int __init ipip_init(void)
830 831
831 printk(banner); 832 printk(banner);
832 833
833 if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { 834 err = register_pernet_device(&ipip_net_ops);
835 if (err < 0)
836 return err;
837 err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
838 if (err < 0) {
839 unregister_pernet_device(&ipip_net_ops);
834 printk(KERN_INFO "ipip init: can't register tunnel\n"); 840 printk(KERN_INFO "ipip init: can't register tunnel\n");
835 return -EAGAIN;
836 } 841 }
837
838 err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
839 if (err)
840 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
841
842 return err; 842 return err;
843} 843}
844 844
@@ -847,7 +847,7 @@ static void __exit ipip_fini(void)
847 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) 847 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
848 printk(KERN_INFO "ipip close: can't deregister tunnel\n"); 848 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
849 849
850 unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); 850 unregister_pernet_device(&ipip_net_ops);
851} 851}
852 852
853module_init(ipip_init); 853module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 99508d66a642..ec19a890c9a0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -47,6 +47,7 @@
47#include <linux/mroute.h> 47#include <linux/mroute.h>
48#include <linux/init.h> 48#include <linux/init.h>
49#include <linux/if_ether.h> 49#include <linux/if_ether.h>
50#include <linux/slab.h>
50#include <net/net_namespace.h> 51#include <net/net_namespace.h>
51#include <net/ip.h> 52#include <net/ip.h>
52#include <net/protocol.h> 53#include <net/protocol.h>
@@ -275,7 +276,8 @@ failure:
275 * @notify: Set to 1, if the caller is a notifier_call 276 * @notify: Set to 1, if the caller is a notifier_call
276 */ 277 */
277 278
278static int vif_delete(struct net *net, int vifi, int notify) 279static int vif_delete(struct net *net, int vifi, int notify,
280 struct list_head *head)
279{ 281{
280 struct vif_device *v; 282 struct vif_device *v;
281 struct net_device *dev; 283 struct net_device *dev;
@@ -319,7 +321,7 @@ static int vif_delete(struct net *net, int vifi, int notify)
319 } 321 }
320 322
321 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 323 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
322 unregister_netdevice(dev); 324 unregister_netdevice_queue(dev, head);
323 325
324 dev_put(dev); 326 dev_put(dev);
325 return 0; 327 return 0;
@@ -469,8 +471,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
469 return err; 471 return err;
470 } 472 }
471 break; 473 break;
474
475 case VIFF_USE_IFINDEX:
472 case 0: 476 case 0:
473 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 477 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
478 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
479 if (dev && dev->ip_ptr == NULL) {
480 dev_put(dev);
481 return -EADDRNOTAVAIL;
482 }
483 } else
484 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
485
474 if (!dev) 486 if (!dev)
475 return -EADDRNOTAVAIL; 487 return -EADDRNOTAVAIL;
476 err = dev_set_allmulti(dev, 1); 488 err = dev_set_allmulti(dev, 1);
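VIFF_USE_IFINDEX lets MRT_ADD_VIF name the underlying device by ifindex rather than by local address, which matters for unnumbered interfaces; the dev->ip_ptr check above still rejects devices with no IPv4 configuration. A hedged userspace sketch (vifc_lcl_ifindex shares a union with vifc_lcl_addr in this patch's linux/mroute.h; mrt_sk must already be a raw IGMP socket with MRT_INIT done):

    #include <string.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <linux/mroute.h>
    #include <net/if.h>

    static int add_vif_by_ifindex(int mrt_sk, vifi_t vifi, const char *ifname)
    {
        struct vifctl vc;

        memset(&vc, 0, sizeof(vc));
        vc.vifc_vifi        = vifi;
        vc.vifc_flags       = VIFF_USE_IFINDEX;       /* new in this patch */
        vc.vifc_lcl_ifindex = if_nametoindex(ifname);
        vc.vifc_threshold   = 1;
        return setsockopt(mrt_sk, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
    }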
@@ -742,7 +754,8 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
742 c->next = mfc_unres_queue; 754 c->next = mfc_unres_queue;
743 mfc_unres_queue = c; 755 mfc_unres_queue = c;
744 756
745 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); 757 if (atomic_read(&net->ipv4.cache_resolve_queue_len) == 1)
758 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
746 } 759 }
747 760
748 /* 761 /*
@@ -791,6 +804,9 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
791 int line; 804 int line;
792 struct mfc_cache *uc, *c, **cp; 805 struct mfc_cache *uc, *c, **cp;
793 806
807 if (mfc->mfcc_parent >= MAXVIFS)
808 return -ENFILE;
809
794 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 810 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
795 811
796 for (cp = &net->ipv4.mfc_cache_array[line]; 812 for (cp = &net->ipv4.mfc_cache_array[line];
@@ -862,14 +878,16 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
862static void mroute_clean_tables(struct net *net) 878static void mroute_clean_tables(struct net *net)
863{ 879{
864 int i; 880 int i;
881 LIST_HEAD(list);
865 882
866 /* 883 /*
867 * Shut down all active vif entries 884 * Shut down all active vif entries
868 */ 885 */
869 for (i = 0; i < net->ipv4.maxvif; i++) { 886 for (i = 0; i < net->ipv4.maxvif; i++) {
870 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) 887 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
871 vif_delete(net, i, 0); 888 vif_delete(net, i, 0, &list);
872 } 889 }
890 unregister_netdevice_many(&list);
873 891
874 /* 892 /*
875 * Wipe the cache 893 * Wipe the cache
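Each unregister_netdevice() pays for at least one RCU grace period; queueing doomed devices and flushing them with a single unregister_netdevice_many() amortizes that wait across every VIF torn down here. The pattern, condensed from the hunk:

    LIST_HEAD(list);
    for (i = 0; i < net->ipv4.maxvif; i++)
        if (!(net->ipv4.vif_table[i].flags & VIFF_STATIC))
            vif_delete(net, i, 0, &list);  /* queues the device, if any */
    unregister_netdevice_many(&list);      /* one grace period for all */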
@@ -948,7 +966,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
948 switch (optname) { 966 switch (optname) {
949 case MRT_INIT: 967 case MRT_INIT:
950 if (sk->sk_type != SOCK_RAW || 968 if (sk->sk_type != SOCK_RAW ||
951 inet_sk(sk)->num != IPPROTO_IGMP) 969 inet_sk(sk)->inet_num != IPPROTO_IGMP)
952 return -EOPNOTSUPP; 970 return -EOPNOTSUPP;
953 if (optlen != sizeof(int)) 971 if (optlen != sizeof(int))
954 return -ENOPROTOOPT; 972 return -ENOPROTOOPT;
@@ -985,7 +1003,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
985 if (optname == MRT_ADD_VIF) { 1003 if (optname == MRT_ADD_VIF) {
986 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); 1004 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
987 } else { 1005 } else {
988 ret = vif_delete(net, vif.vifc_vifi, 0); 1006 ret = vif_delete(net, vif.vifc_vifi, 0, NULL);
989 } 1007 }
990 rtnl_unlock(); 1008 rtnl_unlock();
991 return ret; 1009 return ret;
@@ -1148,17 +1166,16 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1148 struct net *net = dev_net(dev); 1166 struct net *net = dev_net(dev);
1149 struct vif_device *v; 1167 struct vif_device *v;
1150 int ct; 1168 int ct;
1151 1169 LIST_HEAD(list);
1152 if (!net_eq(dev_net(dev), net))
1153 return NOTIFY_DONE;
1154 1170
1155 if (event != NETDEV_UNREGISTER) 1171 if (event != NETDEV_UNREGISTER)
1156 return NOTIFY_DONE; 1172 return NOTIFY_DONE;
1157 v = &net->ipv4.vif_table[0]; 1173 v = &net->ipv4.vif_table[0];
1158 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { 1174 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
1159 if (v->dev == dev) 1175 if (v->dev == dev)
1160 vif_delete(net, ct, 1); 1176 vif_delete(net, ct, 1, &list);
1161 } 1177 }
1178 unregister_netdevice_many(&list);
1162 return NOTIFY_DONE; 1179 return NOTIFY_DONE;
1163} 1180}
1164 1181
@@ -1601,17 +1618,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1601 int ct; 1618 int ct;
1602 struct rtnexthop *nhp; 1619 struct rtnexthop *nhp;
1603 struct net *net = mfc_net(c); 1620 struct net *net = mfc_net(c);
1604 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1605 u8 *b = skb_tail_pointer(skb); 1621 u8 *b = skb_tail_pointer(skb);
1606 struct rtattr *mp_head; 1622 struct rtattr *mp_head;
1607 1623
1608 if (dev) 1624 /* If cache is unresolved, don't try to parse IIF and OIF */
1609 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 1625 if (c->mfc_parent > MAXVIFS)
1626 return -ENOENT;
1627
1628 if (VIF_EXISTS(net, c->mfc_parent))
1629 RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex);
1610 1630
1611 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 1631 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1612 1632
1613 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 1633 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1614 if (c->mfc_un.res.ttls[ct] < 255) { 1634 if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
1615 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 1635 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1616 goto rtattr_failure; 1636 goto rtattr_failure;
1617 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1637 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
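Editorial note: the ipmr hunks above all follow one pattern — vif_delete() grows a list_head parameter, doomed interfaces accumulate on it, and the caller flushes them with a single unregister_netdevice_many() call, so the costly synchronization in device teardown is paid once per batch rather than once per device. Below is a minimal userspace sketch of that collect-then-flush idiom; the types and names are illustrative stand-ins, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct vif {
    int id;
    struct vif *next;   /* chain of vifs queued for teardown */
};

/* Analogue of vif_delete(net, i, notify, &list): unlink the vif and
 * queue it on *head rather than destroying it on the spot. */
static void vif_queue_delete(struct vif *v, struct vif **head)
{
    v->next = *head;
    *head = v;
}

/* Analogue of unregister_netdevice_many(): one pass over the batch,
 * so any per-flush synchronization cost is incurred exactly once. */
static void destroy_many(struct vif *head)
{
    while (head) {
        struct vif *next = head->next;
        printf("destroying vif %d\n", head->id);
        free(head);
        head = next;
    }
}

int main(void)
{
    struct vif *list = NULL;

    for (int i = 0; i < 3; i++) {
        struct vif *v = malloc(sizeof(*v));
        v->id = i;
        vif_queue_delete(v, &list);
    }
    destroy_many(list);   /* single flush, like unregister_netdevice_many() */
    return 0;
}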
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 1725dc0ef688..82fb43c5c59e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -4,6 +4,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/ip.h>
 #include <linux/skbuff.h>
+#include <linux/gfp.h>
 #include <net/route.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -155,10 +156,10 @@ static int nf_ip_reroute(struct sk_buff *skb,
     if (entry->hook == NF_INET_LOCAL_OUT) {
         const struct iphdr *iph = ip_hdr(skb);
 
-        if (!(iph->tos == rt_info->tos
-              && skb->mark == rt_info->mark
-              && iph->daddr == rt_info->daddr
-              && iph->saddr == rt_info->saddr))
+        if (!(iph->tos == rt_info->tos &&
+              skb->mark == rt_info->mark &&
+              iph->daddr == rt_info->daddr &&
+              iph->saddr == rt_info->saddr))
             return ip_route_me_harder(skb, RTN_UNSPEC);
     }
     return 0;
@@ -248,9 +249,9 @@ module_exit(ipv4_netfilter_fini);
 
 #ifdef CONFIG_SYSCTL
 struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
-    { .procname = "net", .ctl_name = CTL_NET, },
-    { .procname = "ipv4", .ctl_name = NET_IPV4, },
-    { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, },
+    { .procname = "net", },
+    { .procname = "ipv4", },
+    { .procname = "netfilter", },
     { }
 };
 EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 27774c99d888..f07d77f65751 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -27,6 +27,7 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -58,6 +59,12 @@ do { \
 #define ARP_NF_ASSERT(x)
 #endif
 
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+    return xt_alloc_initial_table(arpt, ARPT);
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
 static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
                                       const char *hdr_addr, int len)
 {
@@ -226,7 +233,14 @@ arpt_error(struct sk_buff *skb, const struct xt_target_param *par)
     return NF_DROP;
 }
 
-static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+static inline const struct arpt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+    return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
 {
     return (struct arpt_entry *)(base + offset);
 }
@@ -273,7 +287,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
     arp = arp_hdr(skb);
     do {
-        struct arpt_entry_target *t;
+        const struct arpt_entry_target *t;
         int hdr_len;
 
         if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
@@ -285,7 +299,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
             (2 * skb->dev->addr_len);
         ADD_COUNTER(e->counters, hdr_len, 1);
 
-        t = arpt_get_target(e);
+        t = arpt_get_target_c(e);
 
         /* Standard target? */
         if (!t->u.kernel.target->target) {
@@ -351,7 +365,7 @@ static inline bool unconditional(const struct arpt_arp *arp)
 /* Figures out from what hook each rule can be called: returns 0 if
  * there are loops. Puts hook bitmask in comefrom.
  */
-static int mark_source_chains(struct xt_table_info *newinfo,
+static int mark_source_chains(const struct xt_table_info *newinfo,
                               unsigned int valid_hooks, void *entry0)
 {
     unsigned int hook;
@@ -372,7 +386,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 
         for (;;) {
             const struct arpt_standard_target *t
-                = (void *)arpt_get_target(e);
+                = (void *)arpt_get_target_c(e);
             int visited = e->comefrom & (1 << hook);
 
             if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
@@ -384,11 +398,11 @@ static int mark_source_chains(struct xt_table_info *newinfo,
                 |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
 
             /* Unconditional return/END. */
-            if ((e->target_offset == sizeof(struct arpt_entry)
-                 && (strcmp(t->target.u.user.name,
-                            ARPT_STANDARD_TARGET) == 0)
-                 && t->verdict < 0
-                 && unconditional(&e->arp)) || visited) {
+            if ((e->target_offset == sizeof(struct arpt_entry) &&
+                 (strcmp(t->target.u.user.name,
+                         ARPT_STANDARD_TARGET) == 0) &&
+                 t->verdict < 0 && unconditional(&e->arp)) ||
+                visited) {
                 unsigned int oldpos, size;
 
                 if ((strcmp(t->target.u.user.name,
@@ -427,8 +441,8 @@ static int mark_source_chains(struct xt_table_info *newinfo,
                 int newpos = t->verdict;
 
                 if (strcmp(t->target.u.user.name,
-                           ARPT_STANDARD_TARGET) == 0
-                    && newpos >= 0) {
+                           ARPT_STANDARD_TARGET) == 0 &&
+                    newpos >= 0) {
                     if (newpos > newinfo->size -
                         sizeof(struct arpt_entry)) {
                         duprintf("mark_source_chains: "
@@ -456,7 +470,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
     return 1;
 }
 
-static inline int check_entry(struct arpt_entry *e, const char *name)
+static inline int check_entry(const struct arpt_entry *e, const char *name)
 {
     const struct arpt_entry_target *t;
 
@@ -468,7 +482,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
     if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
         return -EINVAL;
 
-    t = arpt_get_target(e);
+    t = arpt_get_target_c(e);
     if (e->target_offset + t->u.target_size > e->next_offset)
         return -EINVAL;
 
@@ -498,8 +512,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 }
 
 static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
-                 unsigned int *i)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
     struct arpt_entry_target *t;
     struct xt_target *target;
@@ -524,8 +537,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
     ret = check_target(e, name);
     if (ret)
         goto err;
-
-    (*i)++;
     return 0;
 err:
     module_put(t->u.kernel.target->me);
@@ -533,14 +544,14 @@ out:
     return ret;
 }
 
-static bool check_underflow(struct arpt_entry *e)
+static bool check_underflow(const struct arpt_entry *e)
 {
     const struct arpt_entry_target *t;
     unsigned int verdict;
 
     if (!unconditional(&e->arp))
         return false;
-    t = arpt_get_target(e);
+    t = arpt_get_target_c(e);
     if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
         return false;
     verdict = ((struct arpt_standard_target *)t)->verdict;
@@ -550,17 +561,16 @@ static bool check_underflow(struct arpt_entry *e)
 
 static inline int check_entry_size_and_hooks(struct arpt_entry *e,
                                              struct xt_table_info *newinfo,
-                                             unsigned char *base,
-                                             unsigned char *limit,
+                                             const unsigned char *base,
+                                             const unsigned char *limit,
                                              const unsigned int *hook_entries,
                                              const unsigned int *underflows,
-                                             unsigned int valid_hooks,
-                                             unsigned int *i)
+                                             unsigned int valid_hooks)
 {
     unsigned int h;
 
-    if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
-        || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+    if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+        (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
         duprintf("Bad offset %p\n", e);
         return -EINVAL;
     }
@@ -592,19 +602,14 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
     /* Clear counters and comefrom */
     e->counters = ((struct xt_counters) { 0, 0 });
     e->comefrom = 0;
-
-    (*i)++;
     return 0;
 }
 
-static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+static inline void cleanup_entry(struct arpt_entry *e)
 {
     struct xt_tgdtor_param par;
     struct arpt_entry_target *t;
 
-    if (i && (*i)-- == 0)
-        return 1;
-
     t = arpt_get_target(e);
     par.target = t->u.kernel.target;
     par.targinfo = t->data;
@@ -612,26 +617,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
     if (par.target->destroy != NULL)
         par.target->destroy(&par);
     module_put(par.target->me);
-    return 0;
 }
 
 /* Checks and translates the user-supplied table segment (held in
  * newinfo).
  */
-static int translate_table(const char *name,
-                           unsigned int valid_hooks,
-                           struct xt_table_info *newinfo,
-                           void *entry0,
-                           unsigned int size,
-                           unsigned int number,
-                           const unsigned int *hook_entries,
-                           const unsigned int *underflows)
+static int translate_table(struct xt_table_info *newinfo, void *entry0,
+                           const struct arpt_replace *repl)
 {
+    struct arpt_entry *iter;
     unsigned int i;
-    int ret;
+    int ret = 0;
 
-    newinfo->size = size;
-    newinfo->number = number;
+    newinfo->size = repl->size;
+    newinfo->number = repl->num_entries;
 
     /* Init all hooks to impossible value. */
     for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -643,52 +642,63 @@ static int translate_table(const char *name,
     i = 0;
 
     /* Walk through entries, checking offsets. */
-    ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
-                             check_entry_size_and_hooks,
-                             newinfo,
-                             entry0,
-                             entry0 + size,
-                             hook_entries, underflows, valid_hooks, &i);
+    xt_entry_foreach(iter, entry0, newinfo->size) {
+        ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+                                         entry0 + repl->size,
+                                         repl->hook_entry,
+                                         repl->underflow,
+                                         repl->valid_hooks);
+        if (ret != 0)
+            break;
+        ++i;
+    }
     duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
     if (ret != 0)
         return ret;
 
-    if (i != number) {
+    if (i != repl->num_entries) {
         duprintf("translate_table: %u not %u entries\n",
-                 i, number);
+                 i, repl->num_entries);
         return -EINVAL;
     }
 
     /* Check hooks all assigned */
     for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
         /* Only hooks which are valid */
-        if (!(valid_hooks & (1 << i)))
+        if (!(repl->valid_hooks & (1 << i)))
             continue;
         if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
             duprintf("Invalid hook entry %u %u\n",
-                     i, hook_entries[i]);
+                     i, repl->hook_entry[i]);
             return -EINVAL;
         }
         if (newinfo->underflow[i] == 0xFFFFFFFF) {
             duprintf("Invalid underflow %u %u\n",
-                     i, underflows[i]);
+                     i, repl->underflow[i]);
             return -EINVAL;
         }
     }
 
-    if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
+    if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
         duprintf("Looping hook\n");
         return -ELOOP;
     }
 
     /* Finally, each sanity check must pass */
     i = 0;
-    ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
-                             find_check_entry, name, size, &i);
+    xt_entry_foreach(iter, entry0, newinfo->size) {
+        ret = find_check_entry(iter, repl->name, repl->size);
+        if (ret != 0)
+            break;
+        ++i;
+    }
 
     if (ret != 0) {
-        ARPT_ENTRY_ITERATE(entry0, newinfo->size,
-                           cleanup_entry, &i);
+        xt_entry_foreach(iter, entry0, newinfo->size) {
+            if (i-- == 0)
+                break;
+            cleanup_entry(iter);
+        }
        return ret;
     }
 
@@ -701,30 +711,10 @@ static int translate_table(const char *name,
     return ret;
 }
 
-/* Gets counters. */
-static inline int add_entry_to_counter(const struct arpt_entry *e,
-                                       struct xt_counters total[],
-                                       unsigned int *i)
-{
-    ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
-    (*i)++;
-    return 0;
-}
-
-static inline int set_entry_to_counter(const struct arpt_entry *e,
-                                       struct xt_counters total[],
-                                       unsigned int *i)
-{
-    SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
-    (*i)++;
-    return 0;
-}
-
 static void get_counters(const struct xt_table_info *t,
                          struct xt_counters counters[])
 {
+    struct arpt_entry *iter;
     unsigned int cpu;
     unsigned int i;
     unsigned int curcpu;
@@ -740,32 +730,32 @@ static void get_counters(const struct xt_table_info *t,
     curcpu = smp_processor_id();
 
     i = 0;
-    ARPT_ENTRY_ITERATE(t->entries[curcpu],
-                       t->size,
-                       set_entry_to_counter,
-                       counters,
-                       &i);
+    xt_entry_foreach(iter, t->entries[curcpu], t->size) {
+        SET_COUNTER(counters[i], iter->counters.bcnt,
+                    iter->counters.pcnt);
+        ++i;
+    }
 
     for_each_possible_cpu(cpu) {
         if (cpu == curcpu)
             continue;
         i = 0;
         xt_info_wrlock(cpu);
-        ARPT_ENTRY_ITERATE(t->entries[cpu],
-                           t->size,
-                           add_entry_to_counter,
-                           counters,
-                           &i);
+        xt_entry_foreach(iter, t->entries[cpu], t->size) {
+            ADD_COUNTER(counters[i], iter->counters.bcnt,
+                        iter->counters.pcnt);
+            ++i;
+        }
         xt_info_wrunlock(cpu);
     }
     local_bh_enable();
 }
 
-static struct xt_counters *alloc_counters(struct xt_table *table)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
     unsigned int countersize;
     struct xt_counters *counters;
-    struct xt_table_info *private = table->private;
+    const struct xt_table_info *private = table->private;
 
     /* We need atomic snapshot of counters: rest doesn't change
      * (other than comefrom, which userspace doesn't care
@@ -783,11 +773,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 }
 
 static int copy_entries_to_user(unsigned int total_size,
-                                struct xt_table *table,
+                                const struct xt_table *table,
                                 void __user *userptr)
 {
     unsigned int off, num;
-    struct arpt_entry *e;
+    const struct arpt_entry *e;
     struct xt_counters *counters;
     struct xt_table_info *private = table->private;
     int ret = 0;
@@ -807,7 +797,7 @@ static int copy_entries_to_user(unsigned int total_size,
     /* FIXME: use iterator macros --RR */
     /* ... then go back and fix counters and names */
     for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
-        struct arpt_entry_target *t;
+        const struct arpt_entry_target *t;
 
         e = (struct arpt_entry *)(loc_cpu_entry + off);
         if (copy_to_user(userptr + off
@@ -818,7 +808,7 @@ static int copy_entries_to_user(unsigned int total_size,
             goto free_counters;
         }
 
-        t = arpt_get_target(e);
+        t = arpt_get_target_c(e);
         if (copy_to_user(userptr + off + e->target_offset
                          + offsetof(struct arpt_entry_target,
                                     u.user.name),
@@ -835,7 +825,7 @@ static int copy_entries_to_user(unsigned int total_size,
 }
 
 #ifdef CONFIG_COMPAT
-static void compat_standard_from_user(void *dst, void *src)
+static void compat_standard_from_user(void *dst, const void *src)
 {
     int v = *(compat_int_t *)src;
 
@@ -844,7 +834,7 @@ static void compat_standard_from_user(void *dst, void *src)
     memcpy(dst, &v, sizeof(v));
 }
 
-static int compat_standard_to_user(void __user *dst, void *src)
+static int compat_standard_to_user(void __user *dst, const void *src)
 {
     compat_int_t cv = *(int *)src;
 
@@ -853,18 +843,18 @@ static int compat_standard_to_user(void __user *dst, void *src)
     return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
 }
 
-static int compat_calc_entry(struct arpt_entry *e,
+static int compat_calc_entry(const struct arpt_entry *e,
                              const struct xt_table_info *info,
-                             void *base, struct xt_table_info *newinfo)
+                             const void *base, struct xt_table_info *newinfo)
 {
-    struct arpt_entry_target *t;
+    const struct arpt_entry_target *t;
     unsigned int entry_offset;
     int off, i, ret;
 
     off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
     entry_offset = (void *)e - base;
 
-    t = arpt_get_target(e);
+    t = arpt_get_target_c(e);
     off += xt_compat_target_offset(t->u.kernel.target);
     newinfo->size -= off;
     ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
@@ -885,7 +875,9 @@ static int compat_calc_entry(struct arpt_entry *e,
 static int compat_table_info(const struct xt_table_info *info,
                              struct xt_table_info *newinfo)
 {
+    struct arpt_entry *iter;
     void *loc_cpu_entry;
+    int ret;
 
     if (!newinfo || !info)
         return -EINVAL;
@@ -894,13 +886,17 @@ static int compat_table_info(const struct xt_table_info *info,
     memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
     newinfo->initial_entries = 0;
     loc_cpu_entry = info->entries[raw_smp_processor_id()];
-    return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
-                              compat_calc_entry, info, loc_cpu_entry,
-                              newinfo);
+    xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+        ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+        if (ret != 0)
+            return ret;
+    }
+    return 0;
 }
 #endif
 
-static int get_info(struct net *net, void __user *user, int *len, int compat)
+static int get_info(struct net *net, void __user *user,
+                    const int *len, int compat)
 {
     char name[ARPT_TABLE_MAXNAMELEN];
     struct xt_table *t;
@@ -925,10 +921,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
     if (t && !IS_ERR(t)) {
         struct arpt_getinfo info;
         const struct xt_table_info *private = t->private;
-
 #ifdef CONFIG_COMPAT
+        struct xt_table_info tmp;
+
         if (compat) {
-            struct xt_table_info tmp;
             ret = compat_table_info(private, &tmp);
             xt_compat_flush_offsets(NFPROTO_ARP);
             private = &tmp;
@@ -959,7 +955,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 }
 
 static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
-                       int *len)
+                       const int *len)
 {
     int ret;
     struct arpt_get_entries get;
@@ -1010,6 +1006,7 @@ static int __do_replace(struct net *net, const char *name,
     struct xt_table_info *oldinfo;
     struct xt_counters *counters;
     void *loc_cpu_old_entry;
+    struct arpt_entry *iter;
 
     ret = 0;
     counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
@@ -1053,8 +1050,8 @@ static int __do_replace(struct net *net, const char *name,
 
     /* Decrease module usage counts and free resource */
     loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-    ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
-                       NULL);
+    xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+        cleanup_entry(iter);
 
     xt_free_table_info(oldinfo);
     if (copy_to_user(counters_ptr, counters,
@@ -1073,12 +1070,14 @@ static int __do_replace(struct net *net, const char *name,
     return ret;
 }
 
-static int do_replace(struct net *net, void __user *user, unsigned int len)
+static int do_replace(struct net *net, const void __user *user,
+                      unsigned int len)
 {
     int ret;
     struct arpt_replace tmp;
     struct xt_table_info *newinfo;
     void *loc_cpu_entry;
+    struct arpt_entry *iter;
 
     if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
         return -EFAULT;
@@ -1099,9 +1098,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
         goto free_newinfo;
     }
 
-    ret = translate_table(tmp.name, tmp.valid_hooks,
-                          newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
-                          tmp.hook_entry, tmp.underflow);
+    ret = translate_table(newinfo, loc_cpu_entry, &tmp);
     if (ret != 0)
         goto free_newinfo;
 
@@ -1114,27 +1111,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
     return 0;
 
  free_newinfo_untrans:
-    ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+    xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+        cleanup_entry(iter);
  free_newinfo:
     xt_free_table_info(newinfo);
     return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-                     const struct xt_counters addme[],
-                     unsigned int *i)
-{
-    ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-    (*i)++;
-    return 0;
-}
-
-static int do_add_counters(struct net *net, void __user *user, unsigned int len,
-                           int compat)
+static int do_add_counters(struct net *net, const void __user *user,
+                           unsigned int len, int compat)
 {
     unsigned int i, curcpu;
     struct xt_counters_info tmp;
@@ -1147,6 +1132,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
     const struct xt_table_info *private;
     int ret = 0;
     void *loc_cpu_entry;
+    struct arpt_entry *iter;
 #ifdef CONFIG_COMPAT
     struct compat_xt_counters_info compat_tmp;
 
@@ -1204,11 +1190,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
     curcpu = smp_processor_id();
     loc_cpu_entry = private->entries[curcpu];
     xt_info_wrlock(curcpu);
-    ARPT_ENTRY_ITERATE(loc_cpu_entry,
-                       private->size,
-                       add_counter_to_entry,
-                       paddc,
-                       &i);
+    xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+        ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+        ++i;
+    }
     xt_info_wrunlock(curcpu);
  unlock_up_free:
     local_bh_enable();
@@ -1221,28 +1206,22 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 }
 
 #ifdef CONFIG_COMPAT
-static inline int
-compat_release_entry(struct compat_arpt_entry *e, unsigned int *i)
+static inline void compat_release_entry(struct compat_arpt_entry *e)
 {
     struct arpt_entry_target *t;
 
-    if (i && (*i)-- == 0)
-        return 1;
-
     t = compat_arpt_get_target(e);
     module_put(t->u.kernel.target->me);
-    return 0;
 }
 
 static inline int
 check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
                                   struct xt_table_info *newinfo,
                                   unsigned int *size,
-                                  unsigned char *base,
-                                  unsigned char *limit,
-                                  unsigned int *hook_entries,
-                                  unsigned int *underflows,
-                                  unsigned int *i,
+                                  const unsigned char *base,
+                                  const unsigned char *limit,
+                                  const unsigned int *hook_entries,
+                                  const unsigned int *underflows,
                                   const char *name)
 {
     struct arpt_entry_target *t;
@@ -1251,8 +1230,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
     int ret, off, h;
 
     duprintf("check_compat_entry_size_and_hooks %p\n", e);
-    if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0
-        || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+    if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+        (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
         duprintf("Bad offset %p, limit = %p\n", e, limit);
         return -EINVAL;
     }
@@ -1302,8 +1281,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
     /* Clear counters and comefrom */
     memset(&e->counters, 0, sizeof(e->counters));
     e->comefrom = 0;
-
-    (*i)++;
     return 0;
 
 release_target:
@@ -1347,19 +1324,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
     return ret;
 }
 
-static inline int compat_check_entry(struct arpt_entry *e, const char *name,
-                                     unsigned int *i)
-{
-    int ret;
-
-    ret = check_target(e, name);
-    if (ret)
-        return ret;
-
-    (*i)++;
-    return 0;
-}
-
 static int translate_compat_table(const char *name,
                                   unsigned int valid_hooks,
                                   struct xt_table_info **pinfo,
@@ -1372,8 +1336,10 @@ static int translate_compat_table(const char *name,
     unsigned int i, j;
     struct xt_table_info *newinfo, *info;
     void *pos, *entry0, *entry1;
+    struct compat_arpt_entry *iter0;
+    struct arpt_entry *iter1;
     unsigned int size;
-    int ret;
+    int ret = 0;
 
     info = *pinfo;
     entry0 = *pentry0;
@@ -1390,13 +1356,17 @@ static int translate_compat_table(const char *name,
     j = 0;
     xt_compat_lock(NFPROTO_ARP);
     /* Walk through entries, checking offsets. */
-    ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
-                                    check_compat_entry_size_and_hooks,
-                                    info, &size, entry0,
-                                    entry0 + total_size,
-                                    hook_entries, underflows, &j, name);
-    if (ret != 0)
-        goto out_unlock;
+    xt_entry_foreach(iter0, entry0, total_size) {
+        ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+                                                entry0,
+                                                entry0 + total_size,
+                                                hook_entries,
+                                                underflows,
+                                                name);
+        if (ret != 0)
+            goto out_unlock;
+        ++j;
+    }
 
     ret = -EINVAL;
     if (j != number) {
@@ -1435,9 +1405,12 @@ static int translate_compat_table(const char *name,
     entry1 = newinfo->entries[raw_smp_processor_id()];
     pos = entry1;
     size = total_size;
-    ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
-                                    compat_copy_entry_from_user,
-                                    &pos, &size, name, newinfo, entry1);
+    xt_entry_foreach(iter0, entry0, total_size) {
+        ret = compat_copy_entry_from_user(iter0, &pos, &size,
                                           name, newinfo, entry1);
+        if (ret != 0)
+            break;
+    }
     xt_compat_flush_offsets(NFPROTO_ARP);
     xt_compat_unlock(NFPROTO_ARP);
     if (ret)
@@ -1448,13 +1421,32 @@ static int translate_compat_table(const char *name,
         goto free_newinfo;
 
     i = 0;
-    ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
-                             name, &i);
+    xt_entry_foreach(iter1, entry1, newinfo->size) {
+        ret = check_target(iter1, name);
+        if (ret != 0)
+            break;
+        ++i;
+    }
     if (ret) {
+        /*
+         * The first i matches need cleanup_entry (calls ->destroy)
+         * because they had called ->check already. The other j-i
+         * entries need only release.
+         */
+        int skip = i;
         j -= i;
-        COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i,
-                                           compat_release_entry, &j);
-        ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i);
+        xt_entry_foreach(iter0, entry0, newinfo->size) {
+            if (skip-- > 0)
+                continue;
+            if (j-- == 0)
+                break;
+            compat_release_entry(iter0);
+        }
+        xt_entry_foreach(iter1, entry1, newinfo->size) {
+            if (i-- == 0)
+                break;
+            cleanup_entry(iter1);
+        }
         xt_free_table_info(newinfo);
         return ret;
     }
@@ -1472,7 +1464,11 @@ static int translate_compat_table(const char *name,
 free_newinfo:
     xt_free_table_info(newinfo);
 out:
-    COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
+    xt_entry_foreach(iter0, entry0, total_size) {
+        if (j-- == 0)
+            break;
+        compat_release_entry(iter0);
+    }
     return ret;
 out_unlock:
     xt_compat_flush_offsets(NFPROTO_ARP);
@@ -1499,6 +1495,7 @@ static int compat_do_replace(struct net *net, void __user *user,
     struct compat_arpt_replace tmp;
     struct xt_table_info *newinfo;
     void *loc_cpu_entry;
+    struct arpt_entry *iter;
 
     if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
         return -EFAULT;
@@ -1536,7 +1533,8 @@ static int compat_do_replace(struct net *net, void __user *user,
     return 0;
 
  free_newinfo_untrans:
-    ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+    xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+        cleanup_entry(iter);
  free_newinfo:
     xt_free_table_info(newinfo);
     return ret;
@@ -1570,7 +1568,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
 static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
                                      compat_uint_t *size,
                                      struct xt_counters *counters,
-                                     unsigned int *i)
+                                     unsigned int i)
 {
     struct arpt_entry_target *t;
     struct compat_arpt_entry __user *ce;
@@ -1578,14 +1576,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
     compat_uint_t origsize;
     int ret;
 
-    ret = -EFAULT;
     origsize = *size;
     ce = (struct compat_arpt_entry __user *)*dstptr;
-    if (copy_to_user(ce, e, sizeof(struct arpt_entry)))
-        goto out;
-
-    if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i])))
-        goto out;
+    if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+        copy_to_user(&ce->counters, &counters[i],
+                     sizeof(counters[i])) != 0)
+        return -EFAULT;
 
     *dstptr += sizeof(struct compat_arpt_entry);
     *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
@@ -1595,18 +1591,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
     t = arpt_get_target(e);
     ret = xt_compat_target_to_user(t, dstptr, size);
     if (ret)
-        goto out;
-    ret = -EFAULT;
+        return ret;
     next_offset = e->next_offset - (origsize - *size);
-    if (put_user(target_offset, &ce->target_offset))
-        goto out;
-    if (put_user(next_offset, &ce->next_offset))
-        goto out;
-
-    (*i)++;
+    if (put_user(target_offset, &ce->target_offset) != 0 ||
+        put_user(next_offset, &ce->next_offset) != 0)
+        return -EFAULT;
     return 0;
-out:
-    return ret;
 }
 
 static int compat_copy_entries_to_user(unsigned int total_size,
@@ -1620,6 +1610,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
     int ret = 0;
     void *loc_cpu_entry;
     unsigned int i = 0;
+    struct arpt_entry *iter;
 
     counters = alloc_counters(table);
     if (IS_ERR(counters))
@@ -1629,9 +1620,12 @@ static int compat_copy_entries_to_user(unsigned int total_size,
     loc_cpu_entry = private->entries[raw_smp_processor_id()];
     pos = userptr;
     size = total_size;
-    ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
-                             compat_copy_entry_to_user,
-                             &pos, &size, counters, &i);
+    xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+        ret = compat_copy_entry_to_user(iter, &pos,
+                                        &size, counters, i++);
+        if (ret != 0)
+            break;
+    }
     vfree(counters);
     return ret;
 }
@@ -1799,12 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net,
     loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
     memcpy(loc_cpu_entry, repl->entries, repl->size);
 
-    ret = translate_table(table->name, table->valid_hooks,
-                          newinfo, loc_cpu_entry, repl->size,
-                          repl->num_entries,
-                          repl->hook_entry,
-                          repl->underflow);
-
+    ret = translate_table(newinfo, loc_cpu_entry, repl);
     duprintf("arpt_register_table: translate table gives %d\n", ret);
     if (ret != 0)
         goto out_free;
@@ -1827,13 +1816,14 @@ void arpt_unregister_table(struct xt_table *table)
     struct xt_table_info *private;
     void *loc_cpu_entry;
     struct module *table_owner = table->me;
+    struct arpt_entry *iter;
 
     private = xt_unregister_table(table);
 
     /* Decrease module usage counts and free resources */
     loc_cpu_entry = private->entries[raw_smp_processor_id()];
-    ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
-                       cleanup_entry, NULL);
+    xt_entry_foreach(iter, loc_cpu_entry, private->size)
+        cleanup_entry(iter);
     if (private->number > private->initial_entries)
         module_put(table_owner);
     xt_free_table_info(private);
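Editorial note: most of the arp_tables.c churn above replaces the ARPT_ENTRY_ITERATE()/COMPAT_ARPT_ENTRY_ITERATE() callback macros with the open-coded xt_entry_foreach() loop, which walks the variable-length rule blob via each entry's next_offset and lets callers break, count, or skip inline instead of threading state through callback varargs. Below is a self-contained userspace sketch of that iteration scheme; the struct and macro are hypothetical stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <string.h>

struct entry {
    unsigned int next_offset;   /* size of this record, in bytes */
    unsigned int bcnt, pcnt;    /* per-entry counters */
};

/* Walks back-to-back variable-length records, like xt_entry_foreach(). */
#define entry_foreach(pos, base, size)                                  \
    for ((pos) = (struct entry *)(base);                                \
         (char *)(pos) < (char *)(base) + (size);                       \
         (pos) = (struct entry *)((char *)(pos) + (pos)->next_offset))

int main(void)
{
    char blob[3 * sizeof(struct entry)];
    struct entry *iter;
    unsigned int i;

    for (i = 0; i < 3; i++) {
        struct entry e = { sizeof(e), 100 * (i + 1), i + 1 };
        memcpy(blob + i * sizeof(e), &e, sizeof(e));
    }

    /* What get_counters() now does: iterate in place, no callback. */
    i = 0;
    entry_foreach(iter, blob, sizeof(blob)) {
        printf("entry %u: bytes=%u pkts=%u\n", i, iter->bcnt, iter->pcnt);
        ++i;
    }
    return 0;
}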
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 97337601827a..79ca5e70d497 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -6,7 +6,9 @@
  */
 
 #include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table");
 #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
                            (1 << NF_ARP_FORWARD))
 
-static const struct
-{
-    struct arpt_replace repl;
-    struct arpt_standard entries[3];
-    struct arpt_error term;
-} initial_table __net_initdata = {
-    .repl = {
-        .name = "filter",
-        .valid_hooks = FILTER_VALID_HOOKS,
-        .num_entries = 4,
-        .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
-        .hook_entry = {
-            [NF_ARP_IN] = 0,
-            [NF_ARP_OUT] = sizeof(struct arpt_standard),
-            [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
-        },
-        .underflow = {
-            [NF_ARP_IN] = 0,
-            [NF_ARP_OUT] = sizeof(struct arpt_standard),
-            [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
-        },
-    },
-    .entries = {
-        ARPT_STANDARD_INIT(NF_ACCEPT),  /* ARP_IN */
-        ARPT_STANDARD_INIT(NF_ACCEPT),  /* ARP_OUT */
-        ARPT_STANDARD_INIT(NF_ACCEPT),  /* ARP_FORWARD */
-    },
-    .term = ARPT_ERROR_INIT,
-};
-
 static const struct xt_table packet_filter = {
     .name = "filter",
     .valid_hooks = FILTER_VALID_HOOKS,
     .me = THIS_MODULE,
     .af = NFPROTO_ARP,
+    .priority = NF_IP_PRI_FILTER,
 };
 
 /* The work comes in here from netfilter.c */
-static unsigned int arpt_in_hook(unsigned int hook,
-                                 struct sk_buff *skb,
-                                 const struct net_device *in,
-                                 const struct net_device *out,
-                                 int (*okfn)(struct sk_buff *))
+static unsigned int
+arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
                      const struct net_device *in, const struct net_device *out,
                      int (*okfn)(struct sk_buff *))
 {
-    return arpt_do_table(skb, hook, in, out,
-                         dev_net(in)->ipv4.arptable_filter);
-}
+    const struct net *net = dev_net((in != NULL) ? in : out);
 
-static unsigned int arpt_out_hook(unsigned int hook,
-                                  struct sk_buff *skb,
-                                  const struct net_device *in,
-                                  const struct net_device *out,
-                                  int (*okfn)(struct sk_buff *))
-{
-    return arpt_do_table(skb, hook, in, out,
-                         dev_net(out)->ipv4.arptable_filter);
+    return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
 }
 
-static struct nf_hook_ops arpt_ops[] __read_mostly = {
-    {
-        .hook = arpt_in_hook,
-        .owner = THIS_MODULE,
-        .pf = NFPROTO_ARP,
-        .hooknum = NF_ARP_IN,
-        .priority = NF_IP_PRI_FILTER,
-    },
-    {
-        .hook = arpt_out_hook,
-        .owner = THIS_MODULE,
-        .pf = NFPROTO_ARP,
-        .hooknum = NF_ARP_OUT,
-        .priority = NF_IP_PRI_FILTER,
-    },
-    {
-        .hook = arpt_in_hook,
-        .owner = THIS_MODULE,
-        .pf = NFPROTO_ARP,
-        .hooknum = NF_ARP_FORWARD,
-        .priority = NF_IP_PRI_FILTER,
-    },
-};
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
 
 static int __net_init arptable_filter_net_init(struct net *net)
 {
-    /* Register table */
+    struct arpt_replace *repl;
+
+    repl = arpt_alloc_initial_table(&packet_filter);
+    if (repl == NULL)
+        return -ENOMEM;
     net->ipv4.arptable_filter =
-        arpt_register_table(net, &packet_filter, &initial_table.repl);
+        arpt_register_table(net, &packet_filter, repl);
+    kfree(repl);
     if (IS_ERR(net->ipv4.arptable_filter))
         return PTR_ERR(net->ipv4.arptable_filter);
     return 0;
@@ -125,9 +71,11 @@ static int __init arptable_filter_init(void)
     if (ret < 0)
         return ret;
 
-    ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
-    if (ret < 0)
+    arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
+    if (IS_ERR(arpfilter_ops)) {
+        ret = PTR_ERR(arpfilter_ops);
         goto cleanup_table;
+    }
     return ret;
 
 cleanup_table:
@@ -137,7 +85,7 @@ cleanup_table:
 
 static void __exit arptable_filter_fini(void)
 {
-    nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
+    xt_hook_unlink(&packet_filter, arpfilter_ops);
     unregister_pernet_subsys(&arptable_filter_net_ops);
 }
 
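Editorial note: arptable_filter.c no longer hand-codes its bootstrap ruleset. arpt_alloc_initial_table(), backed by the xt_alloc_initial_table() template from xt_repldata.h, builds the same replace blob the deleted initial_table provided — one accept-all standard entry per valid hook plus an error terminator — from the table's valid_hooks mask. Below is a rough userspace sketch of the layout computation such a template performs; the names and sizes are illustrative, not the kernel's.

#include <stdio.h>

#define NUM_HOOKS 3
#define ENTRY_SIZE 32u   /* stand-in for sizeof(struct arpt_standard) */

struct layout {
    unsigned int num_entries;
    unsigned int size;
    unsigned int hook_entry[NUM_HOOKS];
    unsigned int underflow[NUM_HOOKS];
};

static void build_layout(struct layout *l, unsigned int valid_hooks)
{
    unsigned int bytes = 0;

    l->num_entries = 0;
    for (unsigned int h = 0; h < NUM_HOOKS; h++) {
        if (!(valid_hooks & (1u << h)))
            continue;
        l->hook_entry[h] = bytes;   /* first rule of this chain */
        l->underflow[h]  = bytes;   /* policy rule of this chain */
        bytes += ENTRY_SIZE;
        l->num_entries++;
    }
    l->num_entries++;               /* trailing error entry */
    l->size = bytes;                /* kernel adds sizeof(error) too */
}

int main(void)
{
    struct layout l;

    build_layout(&l, 0x7);          /* ARP IN, OUT and FORWARD */
    for (int h = 0; h < NUM_HOOKS; h++)
        printf("hook %d -> offset %u\n", h, l.hook_entry[h]);
    printf("%u entries\n", l.num_entries);
    return 0;
}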
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c156db215987..e2787048aa0a 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -26,6 +26,7 @@
 #include <linux/security.h>
 #include <linux/net.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/route.h>
@@ -497,10 +498,9 @@ ipq_rcv_nl_event(struct notifier_block *this,
 {
     struct netlink_notify *n = ptr;
 
-    if (event == NETLINK_URELEASE &&
-        n->protocol == NETLINK_FIREWALL && n->pid) {
+    if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
         write_lock_bh(&queue_lock);
-        if ((n->net == &init_net) && (n->pid == peer_pid))
+        if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
             __ipq_reset();
         write_unlock_bh(&queue_lock);
     }
@@ -516,14 +516,13 @@ static struct ctl_table_header *ipq_sysctl_header;
 
 static ctl_table ipq_table[] = {
     {
-        .ctl_name = NET_IPQ_QMAX,
         .procname = NET_IPQ_QMAX_NAME,
         .data = &queue_maxlen,
         .maxlen = sizeof(queue_maxlen),
         .mode = 0644,
         .proc_handler = proc_dointvec
     },
-    { .ctl_name = 0 }
+    { }
 };
 #endif
 
@@ -622,7 +621,7 @@ cleanup_netlink_notifier:
 static void __exit ip_queue_fini(void)
 {
     nf_unregister_queue_handlers(&nfqh);
-    synchronize_net();
+
     ipq_flush(NULL, 0);
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cde755d5eeab..b29c66df8d1f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -28,6 +28,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -66,6 +67,12 @@ do { \
 #define inline
 #endif
 
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+    return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
 /*
    We keep a set of rules for each CPU, so we can avoid write-locking
    them in the softirq when updating the counters and therefore
@@ -89,9 +96,9 @@ ip_packet_match(const struct iphdr *ip,
 #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
 
     if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
-              IPT_INV_SRCIP)
-        || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
-                 IPT_INV_DSTIP)) {
+              IPT_INV_SRCIP) ||
+        FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+              IPT_INV_DSTIP)) {
         dprintf("Source or dest mismatch.\n");
 
         dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
@@ -122,8 +129,8 @@ ip_packet_match(const struct iphdr *ip,
     }
 
     /* Check specific protocol */
-    if (ipinfo->proto
-        && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+    if (ipinfo->proto &&
+        FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
         dprintf("Packet protocol %hi does not match %hi.%s\n",
                 ip->protocol, ipinfo->proto,
                 ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
@@ -169,7 +176,7 @@ ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
 
 /* Performance critical - called for every packet */
 static inline bool
-do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
+do_match(const struct ipt_entry_match *m, const struct sk_buff *skb,
          struct xt_match_param *par)
 {
     par->match = m->u.kernel.match;
@@ -184,7 +191,7 @@ do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
 
 /* Performance critical */
 static inline struct ipt_entry *
-get_entry(void *base, unsigned int offset)
+get_entry(const void *base, unsigned int offset)
 {
     return (struct ipt_entry *)(base + offset);
 }
@@ -199,6 +206,13 @@ static inline bool unconditional(const struct ipt_ip *ip)
 #undef FWINV
 }
 
+/* for const-correctness */
+static inline const struct ipt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+    return ipt_get_target((struct ipt_entry *)e);
+}
+
 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 static const char *const hooknames[] = {
@@ -233,11 +247,11 @@ static struct nf_loginfo trace_loginfo = {
233 247
234/* Mildly perf critical (only if packet tracing is on) */ 248/* Mildly perf critical (only if packet tracing is on) */
235static inline int 249static inline int
236get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, 250get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
237 const char *hookname, const char **chainname, 251 const char *hookname, const char **chainname,
238 const char **comment, unsigned int *rulenum) 252 const char **comment, unsigned int *rulenum)
239{ 253{
240 struct ipt_standard_target *t = (void *)ipt_get_target(s); 254 const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
241 255
242 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 256 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
243 /* Head of user chain: ERROR target with chainname */ 257 /* Head of user chain: ERROR target with chainname */
@@ -246,11 +260,11 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
246 } else if (s == e) { 260 } else if (s == e) {
247 (*rulenum)++; 261 (*rulenum)++;
248 262
249 if (s->target_offset == sizeof(struct ipt_entry) 263 if (s->target_offset == sizeof(struct ipt_entry) &&
250 && strcmp(t->target.u.kernel.target->name, 264 strcmp(t->target.u.kernel.target->name,
251 IPT_STANDARD_TARGET) == 0 265 IPT_STANDARD_TARGET) == 0 &&
252 && t->verdict < 0 266 t->verdict < 0 &&
253 && unconditional(&s->ip)) { 267 unconditional(&s->ip)) {
254 /* Tail of chains: STANDARD target (return/policy) */ 268 /* Tail of chains: STANDARD target (return/policy) */
255 *comment = *chainname == hookname 269 *comment = *chainname == hookname
256 ? comments[NF_IP_TRACE_COMMENT_POLICY] 270 ? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -263,17 +277,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
263 return 0; 277 return 0;
264} 278}
265 279
266static void trace_packet(struct sk_buff *skb, 280static void trace_packet(const struct sk_buff *skb,
267 unsigned int hook, 281 unsigned int hook,
268 const struct net_device *in, 282 const struct net_device *in,
269 const struct net_device *out, 283 const struct net_device *out,
270 const char *tablename, 284 const char *tablename,
271 struct xt_table_info *private, 285 const struct xt_table_info *private,
272 struct ipt_entry *e) 286 const struct ipt_entry *e)
273{ 287{
274 void *table_base; 288 const void *table_base;
275 const struct ipt_entry *root; 289 const struct ipt_entry *root;
276 const char *hookname, *chainname, *comment; 290 const char *hookname, *chainname, *comment;
291 const struct ipt_entry *iter;
277 unsigned int rulenum = 0; 292 unsigned int rulenum = 0;
278 293
279 table_base = private->entries[smp_processor_id()]; 294 table_base = private->entries[smp_processor_id()];
@@ -282,10 +297,10 @@ static void trace_packet(struct sk_buff *skb,
282 hookname = chainname = hooknames[hook]; 297 hookname = chainname = hooknames[hook];
283 comment = comments[NF_IP_TRACE_COMMENT_RULE]; 298 comment = comments[NF_IP_TRACE_COMMENT_RULE];
284 299
285 IPT_ENTRY_ITERATE(root, 300 xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
286 private->size - private->hook_entry[hook], 301 if (get_chainname_rulenum(iter, e, hookname,
287 get_chainname_rulenum, 302 &chainname, &comment, &rulenum) != 0)
288 e, hookname, &chainname, &comment, &rulenum); 303 break;
289 304
290 nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, 305 nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
291 "TRACE: %s:%s:%s:%u ", 306 "TRACE: %s:%s:%s:%u ",
@@ -315,9 +330,9 @@ ipt_do_table(struct sk_buff *skb,
315 /* Initializing verdict to NF_DROP keeps gcc happy. */ 330 /* Initializing verdict to NF_DROP keeps gcc happy. */
316 unsigned int verdict = NF_DROP; 331 unsigned int verdict = NF_DROP;
317 const char *indev, *outdev; 332 const char *indev, *outdev;
318 void *table_base; 333 const void *table_base;
319 struct ipt_entry *e, *back; 334 struct ipt_entry *e, *back;
320 struct xt_table_info *private; 335 const struct xt_table_info *private;
321 struct xt_match_param mtpar; 336 struct xt_match_param mtpar;
322 struct xt_target_param tgpar; 337 struct xt_target_param tgpar;
323 338
@@ -350,17 +365,22 @@ ipt_do_table(struct sk_buff *skb,
350 back = get_entry(table_base, private->underflow[hook]); 365 back = get_entry(table_base, private->underflow[hook]);
351 366
352 do { 367 do {
353 struct ipt_entry_target *t; 368 const struct ipt_entry_target *t;
369 const struct xt_entry_match *ematch;
354 370
355 IP_NF_ASSERT(e); 371 IP_NF_ASSERT(e);
356 IP_NF_ASSERT(back); 372 IP_NF_ASSERT(back);
357 if (!ip_packet_match(ip, indev, outdev, 373 if (!ip_packet_match(ip, indev, outdev,
358 &e->ip, mtpar.fragoff) || 374 &e->ip, mtpar.fragoff)) {
359 IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) { 375 no_match:
360 e = ipt_next_entry(e); 376 e = ipt_next_entry(e);
361 continue; 377 continue;
362 } 378 }
363 379
380 xt_ematch_foreach(ematch, e)
381 if (do_match(ematch, skb, &mtpar) != 0)
382 goto no_match;
383
364 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); 384 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
365 385
366 t = ipt_get_target(e); 386 t = ipt_get_target(e);
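
With the match walk now inline, a failing extension match can no longer abort via a callback's return value; it jumps back to the no_match label tucked inside the preceding if block, so both failure cases share one advance-and-continue tail. The same control flow in miniature, with toy predicates (rule 1 fails the address check, rule 2 fails an extension match):

    #include <stdio.h>

    static int ip_ok(int r)  { return r != 1; }     /* toy predicates */
    static int ext_ok(int r) { return r != 2; }

    int main(void)
    {
            int r = 0;

            while (r < 4) {
                    if (!ip_ok(r)) {
    no_match:
                            r++;                    /* advance to next rule */
                            continue;
                    }
                    if (!ext_ok(r))
                            goto no_match;          /* reuse the advance path */
                    printf("rule %d matched\n", r);
                    r++;
            }
            return 0;
    }
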
@@ -388,8 +408,8 @@ ipt_do_table(struct sk_buff *skb,
388 back = get_entry(table_base, back->comefrom); 408 back = get_entry(table_base, back->comefrom);
389 continue; 409 continue;
390 } 410 }
391 if (table_base + v != ipt_next_entry(e) 411 if (table_base + v != ipt_next_entry(e) &&
392 && !(e->ip.flags & IPT_F_GOTO)) { 412 !(e->ip.flags & IPT_F_GOTO)) {
393 /* Save old back ptr in next entry */ 413 /* Save old back ptr in next entry */
394 struct ipt_entry *next = ipt_next_entry(e); 414 struct ipt_entry *next = ipt_next_entry(e);
395 next->comefrom = (void *)back - table_base; 415 next->comefrom = (void *)back - table_base;
@@ -443,7 +463,7 @@ ipt_do_table(struct sk_buff *skb,
443/* Figures out from what hook each rule can be called: returns 0 if 463/* Figures out from what hook each rule can be called: returns 0 if
444 there are loops. Puts hook bitmask in comefrom. */ 464 there are loops. Puts hook bitmask in comefrom. */
445static int 465static int
446mark_source_chains(struct xt_table_info *newinfo, 466mark_source_chains(const struct xt_table_info *newinfo,
447 unsigned int valid_hooks, void *entry0) 467 unsigned int valid_hooks, void *entry0)
448{ 468{
449 unsigned int hook; 469 unsigned int hook;
@@ -461,8 +481,8 @@ mark_source_chains(struct xt_table_info *newinfo,
461 e->counters.pcnt = pos; 481 e->counters.pcnt = pos;
462 482
463 for (;;) { 483 for (;;) {
464 struct ipt_standard_target *t 484 const struct ipt_standard_target *t
465 = (void *)ipt_get_target(e); 485 = (void *)ipt_get_target_c(e);
466 int visited = e->comefrom & (1 << hook); 486 int visited = e->comefrom & (1 << hook);
467 487
468 if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { 488 if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
@@ -473,11 +493,11 @@ mark_source_chains(struct xt_table_info *newinfo,
473 e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); 493 e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
474 494
475 /* Unconditional return/END. */ 495 /* Unconditional return/END. */
476 if ((e->target_offset == sizeof(struct ipt_entry) 496 if ((e->target_offset == sizeof(struct ipt_entry) &&
477 && (strcmp(t->target.u.user.name, 497 (strcmp(t->target.u.user.name,
478 IPT_STANDARD_TARGET) == 0) 498 IPT_STANDARD_TARGET) == 0) &&
479 && t->verdict < 0 499 t->verdict < 0 && unconditional(&e->ip)) ||
480 && unconditional(&e->ip)) || visited) { 500 visited) {
481 unsigned int oldpos, size; 501 unsigned int oldpos, size;
482 502
483 if ((strcmp(t->target.u.user.name, 503 if ((strcmp(t->target.u.user.name,
@@ -524,8 +544,8 @@ mark_source_chains(struct xt_table_info *newinfo,
524 int newpos = t->verdict; 544 int newpos = t->verdict;
525 545
526 if (strcmp(t->target.u.user.name, 546 if (strcmp(t->target.u.user.name,
527 IPT_STANDARD_TARGET) == 0 547 IPT_STANDARD_TARGET) == 0 &&
528 && newpos >= 0) { 548 newpos >= 0) {
529 if (newpos > newinfo->size - 549 if (newpos > newinfo->size -
530 sizeof(struct ipt_entry)) { 550 sizeof(struct ipt_entry)) {
531 duprintf("mark_source_chains: " 551 duprintf("mark_source_chains: "
@@ -552,27 +572,23 @@ mark_source_chains(struct xt_table_info *newinfo,
552 return 1; 572 return 1;
553} 573}
554 574
555static int 575static void cleanup_match(struct ipt_entry_match *m, struct net *net)
556cleanup_match(struct ipt_entry_match *m, unsigned int *i)
557{ 576{
558 struct xt_mtdtor_param par; 577 struct xt_mtdtor_param par;
559 578
560 if (i && (*i)-- == 0) 579 par.net = net;
561 return 1;
562
563 par.match = m->u.kernel.match; 580 par.match = m->u.kernel.match;
564 par.matchinfo = m->data; 581 par.matchinfo = m->data;
565 par.family = NFPROTO_IPV4; 582 par.family = NFPROTO_IPV4;
566 if (par.match->destroy != NULL) 583 if (par.match->destroy != NULL)
567 par.match->destroy(&par); 584 par.match->destroy(&par);
568 module_put(par.match->me); 585 module_put(par.match->me);
569 return 0;
570} 586}
571 587
572static int 588static int
573check_entry(struct ipt_entry *e, const char *name) 589check_entry(const struct ipt_entry *e, const char *name)
574{ 590{
575 struct ipt_entry_target *t; 591 const struct ipt_entry_target *t;
576 592
577 if (!ip_checkentry(&e->ip)) { 593 if (!ip_checkentry(&e->ip)) {
578 duprintf("ip_tables: ip check failed %p %s.\n", e, name); 594 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
@@ -583,7 +599,7 @@ check_entry(struct ipt_entry *e, const char *name)
583 e->next_offset) 599 e->next_offset)
584 return -EINVAL; 600 return -EINVAL;
585 601
586 t = ipt_get_target(e); 602 t = ipt_get_target_c(e);
587 if (e->target_offset + t->u.target_size > e->next_offset) 603 if (e->target_offset + t->u.target_size > e->next_offset)
588 return -EINVAL; 604 return -EINVAL;
589 605
@@ -591,8 +607,7 @@ check_entry(struct ipt_entry *e, const char *name)
591} 607}
592 608
593static int 609static int
594check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, 610check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
595 unsigned int *i)
596{ 611{
597 const struct ipt_ip *ip = par->entryinfo; 612 const struct ipt_ip *ip = par->entryinfo;
598 int ret; 613 int ret;
@@ -607,13 +622,11 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
607 par.match->name); 622 par.match->name);
608 return ret; 623 return ret;
609 } 624 }
610 ++*i;
611 return 0; 625 return 0;
612} 626}
613 627
614static int 628static int
615find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, 629find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
616 unsigned int *i)
617{ 630{
618 struct xt_match *match; 631 struct xt_match *match;
619 int ret; 632 int ret;
@@ -627,7 +640,7 @@ find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
627 } 640 }
628 m->u.kernel.match = match; 641 m->u.kernel.match = match;
629 642
630 ret = check_match(m, par, i); 643 ret = check_match(m, par);
631 if (ret) 644 if (ret)
632 goto err; 645 goto err;
633 646
@@ -637,10 +650,11 @@ err:
637 return ret; 650 return ret;
638} 651}
639 652
640static int check_target(struct ipt_entry *e, const char *name) 653static int check_target(struct ipt_entry *e, struct net *net, const char *name)
641{ 654{
642 struct ipt_entry_target *t = ipt_get_target(e); 655 struct ipt_entry_target *t = ipt_get_target(e);
643 struct xt_tgchk_param par = { 656 struct xt_tgchk_param par = {
657 .net = net,
644 .table = name, 658 .table = name,
645 .entryinfo = e, 659 .entryinfo = e,
646 .target = t->u.kernel.target, 660 .target = t->u.kernel.target,
@@ -661,27 +675,32 @@ static int check_target(struct ipt_entry *e, const char *name)
661} 675}
662 676
663static int 677static int
664find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, 678find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
665 unsigned int *i) 679 unsigned int size)
666{ 680{
667 struct ipt_entry_target *t; 681 struct ipt_entry_target *t;
668 struct xt_target *target; 682 struct xt_target *target;
669 int ret; 683 int ret;
670 unsigned int j; 684 unsigned int j;
671 struct xt_mtchk_param mtpar; 685 struct xt_mtchk_param mtpar;
686 struct xt_entry_match *ematch;
672 687
673 ret = check_entry(e, name); 688 ret = check_entry(e, name);
674 if (ret) 689 if (ret)
675 return ret; 690 return ret;
676 691
677 j = 0; 692 j = 0;
693 mtpar.net = net;
678 mtpar.table = name; 694 mtpar.table = name;
679 mtpar.entryinfo = &e->ip; 695 mtpar.entryinfo = &e->ip;
680 mtpar.hook_mask = e->comefrom; 696 mtpar.hook_mask = e->comefrom;
681 mtpar.family = NFPROTO_IPV4; 697 mtpar.family = NFPROTO_IPV4;
682 ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); 698 xt_ematch_foreach(ematch, e) {
683 if (ret != 0) 699 ret = find_check_match(ematch, &mtpar);
684 goto cleanup_matches; 700 if (ret != 0)
701 goto cleanup_matches;
702 ++j;
703 }
685 704
686 t = ipt_get_target(e); 705 t = ipt_get_target(e);
687 target = try_then_request_module(xt_find_target(AF_INET, 706 target = try_then_request_module(xt_find_target(AF_INET,
@@ -695,27 +714,29 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
695 } 714 }
696 t->u.kernel.target = target; 715 t->u.kernel.target = target;
697 716
698 ret = check_target(e, name); 717 ret = check_target(e, net, name);
699 if (ret) 718 if (ret)
700 goto err; 719 goto err;
701
702 (*i)++;
703 return 0; 720 return 0;
704 err: 721 err:
705 module_put(t->u.kernel.target->me); 722 module_put(t->u.kernel.target->me);
706 cleanup_matches: 723 cleanup_matches:
707 IPT_MATCH_ITERATE(e, cleanup_match, &j); 724 xt_ematch_foreach(ematch, e) {
725 if (j-- == 0)
726 break;
727 cleanup_match(ematch, net);
728 }
708 return ret; 729 return ret;
709} 730}
710 731
711static bool check_underflow(struct ipt_entry *e) 732static bool check_underflow(const struct ipt_entry *e)
712{ 733{
713 const struct ipt_entry_target *t; 734 const struct ipt_entry_target *t;
714 unsigned int verdict; 735 unsigned int verdict;
715 736
716 if (!unconditional(&e->ip)) 737 if (!unconditional(&e->ip))
717 return false; 738 return false;
718 t = ipt_get_target(e); 739 t = ipt_get_target_c(e);
719 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 740 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
720 return false; 741 return false;
721 verdict = ((struct ipt_standard_target *)t)->verdict; 742 verdict = ((struct ipt_standard_target *)t)->verdict;
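
find_check_entry() now counts successful match checks in j and, on error, re-walks the entry undoing exactly the first j of them. The rollback idiom on its own, with hypothetical init/undo helpers standing in for find_check_match() and cleanup_match():

    #include <stdio.h>

    #define N 5

    static int init_item(int k)  { return k == 3 ? -1 : 0; }  /* item 3 fails */
    static void undo_item(int k) { printf("undo %d\n", k); }

    int main(void)
    {
            unsigned int j = 0;
            int ret = 0;

            for (int k = 0; k < N; k++) {
                    ret = init_item(k);
                    if (ret != 0)
                            goto cleanup;
                    ++j;                    /* count successful set-ups */
            }
            return 0;

    cleanup:
            for (int k = 0; k < N; k++) {
                    if (j-- == 0)           /* undo exactly the first j */
                            break;
                    undo_item(k);
            }
            return ret;
    }
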
@@ -726,17 +747,16 @@ static bool check_underflow(struct ipt_entry *e)
726static int 747static int
727check_entry_size_and_hooks(struct ipt_entry *e, 748check_entry_size_and_hooks(struct ipt_entry *e,
728 struct xt_table_info *newinfo, 749 struct xt_table_info *newinfo,
729 unsigned char *base, 750 const unsigned char *base,
730 unsigned char *limit, 751 const unsigned char *limit,
731 const unsigned int *hook_entries, 752 const unsigned int *hook_entries,
732 const unsigned int *underflows, 753 const unsigned int *underflows,
733 unsigned int valid_hooks, 754 unsigned int valid_hooks)
734 unsigned int *i)
735{ 755{
736 unsigned int h; 756 unsigned int h;
737 757
738 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 758 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
739 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { 759 (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
740 duprintf("Bad offset %p\n", e); 760 duprintf("Bad offset %p\n", e);
741 return -EINVAL; 761 return -EINVAL;
742 } 762 }
@@ -768,50 +788,42 @@ check_entry_size_and_hooks(struct ipt_entry *e,
768 /* Clear counters and comefrom */ 788 /* Clear counters and comefrom */
769 e->counters = ((struct xt_counters) { 0, 0 }); 789 e->counters = ((struct xt_counters) { 0, 0 });
770 e->comefrom = 0; 790 e->comefrom = 0;
771
772 (*i)++;
773 return 0; 791 return 0;
774} 792}
775 793
776static int 794static void
777cleanup_entry(struct ipt_entry *e, unsigned int *i) 795cleanup_entry(struct ipt_entry *e, struct net *net)
778{ 796{
779 struct xt_tgdtor_param par; 797 struct xt_tgdtor_param par;
780 struct ipt_entry_target *t; 798 struct ipt_entry_target *t;
781 799 struct xt_entry_match *ematch;
782 if (i && (*i)-- == 0)
783 return 1;
784 800
785 /* Cleanup all matches */ 801 /* Cleanup all matches */
786 IPT_MATCH_ITERATE(e, cleanup_match, NULL); 802 xt_ematch_foreach(ematch, e)
803 cleanup_match(ematch, net);
787 t = ipt_get_target(e); 804 t = ipt_get_target(e);
788 805
806 par.net = net;
789 par.target = t->u.kernel.target; 807 par.target = t->u.kernel.target;
790 par.targinfo = t->data; 808 par.targinfo = t->data;
791 par.family = NFPROTO_IPV4; 809 par.family = NFPROTO_IPV4;
792 if (par.target->destroy != NULL) 810 if (par.target->destroy != NULL)
793 par.target->destroy(&par); 811 par.target->destroy(&par);
794 module_put(par.target->me); 812 module_put(par.target->me);
795 return 0;
796} 813}
797 814
798/* Checks and translates the user-supplied table segment (held in 815/* Checks and translates the user-supplied table segment (held in
799 newinfo) */ 816 newinfo) */
800static int 817static int
801translate_table(const char *name, 818translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
802 unsigned int valid_hooks, 819 const struct ipt_replace *repl)
803 struct xt_table_info *newinfo,
804 void *entry0,
805 unsigned int size,
806 unsigned int number,
807 const unsigned int *hook_entries,
808 const unsigned int *underflows)
809{ 820{
821 struct ipt_entry *iter;
810 unsigned int i; 822 unsigned int i;
811 int ret; 823 int ret = 0;
812 824
813 newinfo->size = size; 825 newinfo->size = repl->size;
814 newinfo->number = number; 826 newinfo->number = repl->num_entries;
815 827
816 /* Init all hooks to impossible value. */ 828 /* Init all hooks to impossible value. */
817 for (i = 0; i < NF_INET_NUMHOOKS; i++) { 829 for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -822,49 +834,58 @@ translate_table(const char *name,
822 duprintf("translate_table: size %u\n", newinfo->size); 834 duprintf("translate_table: size %u\n", newinfo->size);
823 i = 0; 835 i = 0;
824 /* Walk through entries, checking offsets. */ 836 /* Walk through entries, checking offsets. */
825 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, 837 xt_entry_foreach(iter, entry0, newinfo->size) {
826 check_entry_size_and_hooks, 838 ret = check_entry_size_and_hooks(iter, newinfo, entry0,
827 newinfo, 839 entry0 + repl->size,
828 entry0, 840 repl->hook_entry,
829 entry0 + size, 841 repl->underflow,
830 hook_entries, underflows, valid_hooks, &i); 842 repl->valid_hooks);
831 if (ret != 0) 843 if (ret != 0)
832 return ret; 844 return ret;
845 ++i;
846 }
833 847
834 if (i != number) { 848 if (i != repl->num_entries) {
835 duprintf("translate_table: %u not %u entries\n", 849 duprintf("translate_table: %u not %u entries\n",
836 i, number); 850 i, repl->num_entries);
837 return -EINVAL; 851 return -EINVAL;
838 } 852 }
839 853
840 /* Check hooks all assigned */ 854 /* Check hooks all assigned */
841 for (i = 0; i < NF_INET_NUMHOOKS; i++) { 855 for (i = 0; i < NF_INET_NUMHOOKS; i++) {
842 /* Only hooks which are valid */ 856 /* Only hooks which are valid */
843 if (!(valid_hooks & (1 << i))) 857 if (!(repl->valid_hooks & (1 << i)))
844 continue; 858 continue;
845 if (newinfo->hook_entry[i] == 0xFFFFFFFF) { 859 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
846 duprintf("Invalid hook entry %u %u\n", 860 duprintf("Invalid hook entry %u %u\n",
847 i, hook_entries[i]); 861 i, repl->hook_entry[i]);
848 return -EINVAL; 862 return -EINVAL;
849 } 863 }
850 if (newinfo->underflow[i] == 0xFFFFFFFF) { 864 if (newinfo->underflow[i] == 0xFFFFFFFF) {
851 duprintf("Invalid underflow %u %u\n", 865 duprintf("Invalid underflow %u %u\n",
852 i, underflows[i]); 866 i, repl->underflow[i]);
853 return -EINVAL; 867 return -EINVAL;
854 } 868 }
855 } 869 }
856 870
857 if (!mark_source_chains(newinfo, valid_hooks, entry0)) 871 if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
858 return -ELOOP; 872 return -ELOOP;
859 873
860 /* Finally, each sanity check must pass */ 874 /* Finally, each sanity check must pass */
861 i = 0; 875 i = 0;
862 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, 876 xt_entry_foreach(iter, entry0, newinfo->size) {
863 find_check_entry, name, size, &i); 877 ret = find_check_entry(iter, net, repl->name, repl->size);
878 if (ret != 0)
879 break;
880 ++i;
881 }
864 882
865 if (ret != 0) { 883 if (ret != 0) {
866 IPT_ENTRY_ITERATE(entry0, newinfo->size, 884 xt_entry_foreach(iter, entry0, newinfo->size) {
867 cleanup_entry, &i); 885 if (i-- == 0)
886 break;
887 cleanup_entry(iter, net);
888 }
868 return ret; 889 return ret;
869 } 890 }
870 891
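
translate_table() now takes the ipt_replace descriptor itself rather than six fields copied out of it, which keeps the signature stable as fields are added. The shape of that consolidation, reduced to a toy descriptor:

    #include <stdio.h>

    struct replace_desc {                   /* toy ipt_replace analogue */
            const char  *name;
            unsigned int valid_hooks;
            unsigned int num_entries;
            unsigned int size;
    };

    static int translate(const struct replace_desc *repl)
    {
            printf("table %s: %u entries, %u bytes, hooks %#x\n",
                   repl->name, repl->num_entries, repl->size,
                   repl->valid_hooks);
            return 0;
    }

    int main(void)
    {
            const struct replace_desc repl = { "filter", 0x0e, 4, 1024 };

            return translate(&repl);        /* one pointer, not six scalars */
    }

Both call sites in this patch, do_replace() and ipt_register_table(), now just hand over the descriptor they already hold.
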
@@ -877,33 +898,11 @@ translate_table(const char *name,
877 return ret; 898 return ret;
878} 899}
879 900
880/* Gets counters. */
881static inline int
882add_entry_to_counter(const struct ipt_entry *e,
883 struct xt_counters total[],
884 unsigned int *i)
885{
886 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
887
888 (*i)++;
889 return 0;
890}
891
892static inline int
893set_entry_to_counter(const struct ipt_entry *e,
894 struct ipt_counters total[],
895 unsigned int *i)
896{
897 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
898
899 (*i)++;
900 return 0;
901}
902
903static void 901static void
904get_counters(const struct xt_table_info *t, 902get_counters(const struct xt_table_info *t,
905 struct xt_counters counters[]) 903 struct xt_counters counters[])
906{ 904{
905 struct ipt_entry *iter;
907 unsigned int cpu; 906 unsigned int cpu;
908 unsigned int i; 907 unsigned int i;
909 unsigned int curcpu; 908 unsigned int curcpu;
@@ -919,32 +918,32 @@ get_counters(const struct xt_table_info *t,
919 curcpu = smp_processor_id(); 918 curcpu = smp_processor_id();
920 919
921 i = 0; 920 i = 0;
922 IPT_ENTRY_ITERATE(t->entries[curcpu], 921 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
923 t->size, 922 SET_COUNTER(counters[i], iter->counters.bcnt,
924 set_entry_to_counter, 923 iter->counters.pcnt);
925 counters, 924 ++i;
926 &i); 925 }
927 926
928 for_each_possible_cpu(cpu) { 927 for_each_possible_cpu(cpu) {
929 if (cpu == curcpu) 928 if (cpu == curcpu)
930 continue; 929 continue;
931 i = 0; 930 i = 0;
932 xt_info_wrlock(cpu); 931 xt_info_wrlock(cpu);
933 IPT_ENTRY_ITERATE(t->entries[cpu], 932 xt_entry_foreach(iter, t->entries[cpu], t->size) {
934 t->size, 933 ADD_COUNTER(counters[i], iter->counters.bcnt,
935 add_entry_to_counter, 934 iter->counters.pcnt);
936 counters, 935 ++i; /* macro does multi eval of i */
937 &i); 936 }
938 xt_info_wrunlock(cpu); 937 xt_info_wrunlock(cpu);
939 } 938 }
940 local_bh_enable(); 939 local_bh_enable();
941} 940}
942 941
943static struct xt_counters * alloc_counters(struct xt_table *table) 942static struct xt_counters *alloc_counters(const struct xt_table *table)
944{ 943{
945 unsigned int countersize; 944 unsigned int countersize;
946 struct xt_counters *counters; 945 struct xt_counters *counters;
947 struct xt_table_info *private = table->private; 946 const struct xt_table_info *private = table->private;
948 947
949 /* We need atomic snapshot of counters: rest doesn't change 948 /* We need atomic snapshot of counters: rest doesn't change
950 (other than comefrom, which userspace doesn't care 949 (other than comefrom, which userspace doesn't care
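
The rewritten get_counters() folds the per-CPU tables inline: SET_COUNTER seeds the totals from the local CPU, then ADD_COUNTER accumulates every other CPU under that CPU's write lock. A lock-free userspace model, with plain arrays standing in for per-CPU data:

    #include <stdio.h>

    #define NCPUS   2
    #define NRULES  3

    struct counter { unsigned long long bcnt, pcnt; };

    int main(void)
    {
            struct counter percpu[NCPUS][NRULES] = {
                    { {100, 1}, {200, 2}, {300, 3} },
                    { {10, 1},  {20, 2},  {30, 3}  },
            };
            struct counter total[NRULES];
            int curcpu = 0;

            for (int i = 0; i < NRULES; i++)
                    total[i] = percpu[curcpu][i];          /* SET_COUNTER */
            for (int cpu = 0; cpu < NCPUS; cpu++) {
                    if (cpu == curcpu)
                            continue;
                    for (int i = 0; i < NRULES; i++) {     /* ADD_COUNTER */
                            total[i].bcnt += percpu[cpu][i].bcnt;
                            total[i].pcnt += percpu[cpu][i].pcnt;
                    }
            }
            for (int i = 0; i < NRULES; i++)
                    printf("rule %d: %llu bytes, %llu pkts\n",
                           i, total[i].bcnt, total[i].pcnt);
            return 0;
    }

The explicit ++i in the kernel loop also sidesteps passing counters[i++] into ADD_COUNTER(), whose macro body evaluates its arguments more than once.
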
@@ -962,11 +961,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
962 961
963static int 962static int
964copy_entries_to_user(unsigned int total_size, 963copy_entries_to_user(unsigned int total_size,
965 struct xt_table *table, 964 const struct xt_table *table,
966 void __user *userptr) 965 void __user *userptr)
967{ 966{
968 unsigned int off, num; 967 unsigned int off, num;
969 struct ipt_entry *e; 968 const struct ipt_entry *e;
970 struct xt_counters *counters; 969 struct xt_counters *counters;
971 const struct xt_table_info *private = table->private; 970 const struct xt_table_info *private = table->private;
972 int ret = 0; 971 int ret = 0;
@@ -1018,7 +1017,7 @@ copy_entries_to_user(unsigned int total_size,
1018 } 1017 }
1019 } 1018 }
1020 1019
1021 t = ipt_get_target(e); 1020 t = ipt_get_target_c(e);
1022 if (copy_to_user(userptr + off + e->target_offset 1021 if (copy_to_user(userptr + off + e->target_offset
1023 + offsetof(struct ipt_entry_target, 1022 + offsetof(struct ipt_entry_target,
1024 u.user.name), 1023 u.user.name),
@@ -1035,7 +1034,7 @@ copy_entries_to_user(unsigned int total_size,
1035} 1034}
1036 1035
1037#ifdef CONFIG_COMPAT 1036#ifdef CONFIG_COMPAT
1038static void compat_standard_from_user(void *dst, void *src) 1037static void compat_standard_from_user(void *dst, const void *src)
1039{ 1038{
1040 int v = *(compat_int_t *)src; 1039 int v = *(compat_int_t *)src;
1041 1040
@@ -1044,7 +1043,7 @@ static void compat_standard_from_user(void *dst, void *src)
1044 memcpy(dst, &v, sizeof(v)); 1043 memcpy(dst, &v, sizeof(v));
1045} 1044}
1046 1045
1047static int compat_standard_to_user(void __user *dst, void *src) 1046static int compat_standard_to_user(void __user *dst, const void *src)
1048{ 1047{
1049 compat_int_t cv = *(int *)src; 1048 compat_int_t cv = *(int *)src;
1050 1049
@@ -1053,25 +1052,20 @@ static int compat_standard_to_user(void __user *dst, void *src)
1053 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; 1052 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
1054} 1053}
1055 1054
1056static inline int 1055static int compat_calc_entry(const struct ipt_entry *e,
1057compat_calc_match(struct ipt_entry_match *m, int *size)
1058{
1059 *size += xt_compat_match_offset(m->u.kernel.match);
1060 return 0;
1061}
1062
1063static int compat_calc_entry(struct ipt_entry *e,
1064 const struct xt_table_info *info, 1056 const struct xt_table_info *info,
1065 void *base, struct xt_table_info *newinfo) 1057 const void *base, struct xt_table_info *newinfo)
1066{ 1058{
1067 struct ipt_entry_target *t; 1059 const struct xt_entry_match *ematch;
1060 const struct ipt_entry_target *t;
1068 unsigned int entry_offset; 1061 unsigned int entry_offset;
1069 int off, i, ret; 1062 int off, i, ret;
1070 1063
1071 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1064 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1072 entry_offset = (void *)e - base; 1065 entry_offset = (void *)e - base;
1073 IPT_MATCH_ITERATE(e, compat_calc_match, &off); 1066 xt_ematch_foreach(ematch, e)
1074 t = ipt_get_target(e); 1067 off += xt_compat_match_offset(ematch->u.kernel.match);
1068 t = ipt_get_target_c(e);
1075 off += xt_compat_target_offset(t->u.kernel.target); 1069 off += xt_compat_target_offset(t->u.kernel.target);
1076 newinfo->size -= off; 1070 newinfo->size -= off;
1077 ret = xt_compat_add_offset(AF_INET, entry_offset, off); 1071 ret = xt_compat_add_offset(AF_INET, entry_offset, off);
@@ -1092,7 +1086,9 @@ static int compat_calc_entry(struct ipt_entry *e,
1092static int compat_table_info(const struct xt_table_info *info, 1086static int compat_table_info(const struct xt_table_info *info,
1093 struct xt_table_info *newinfo) 1087 struct xt_table_info *newinfo)
1094{ 1088{
1089 struct ipt_entry *iter;
1095 void *loc_cpu_entry; 1090 void *loc_cpu_entry;
1091 int ret;
1096 1092
1097 if (!newinfo || !info) 1093 if (!newinfo || !info)
1098 return -EINVAL; 1094 return -EINVAL;
@@ -1101,13 +1097,17 @@ static int compat_table_info(const struct xt_table_info *info,
1101 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1097 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1102 newinfo->initial_entries = 0; 1098 newinfo->initial_entries = 0;
1103 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1099 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1104 return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, 1100 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1105 compat_calc_entry, info, loc_cpu_entry, 1101 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1106 newinfo); 1102 if (ret != 0)
1103 return ret;
1104 }
1105 return 0;
1107} 1106}
1108#endif 1107#endif
1109 1108
1110static int get_info(struct net *net, void __user *user, int *len, int compat) 1109static int get_info(struct net *net, void __user *user,
1110 const int *len, int compat)
1111{ 1111{
1112 char name[IPT_TABLE_MAXNAMELEN]; 1112 char name[IPT_TABLE_MAXNAMELEN];
1113 struct xt_table *t; 1113 struct xt_table *t;
@@ -1132,10 +1132,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
1132 if (t && !IS_ERR(t)) { 1132 if (t && !IS_ERR(t)) {
1133 struct ipt_getinfo info; 1133 struct ipt_getinfo info;
1134 const struct xt_table_info *private = t->private; 1134 const struct xt_table_info *private = t->private;
1135
1136#ifdef CONFIG_COMPAT 1135#ifdef CONFIG_COMPAT
1136 struct xt_table_info tmp;
1137
1137 if (compat) { 1138 if (compat) {
1138 struct xt_table_info tmp;
1139 ret = compat_table_info(private, &tmp); 1139 ret = compat_table_info(private, &tmp);
1140 xt_compat_flush_offsets(AF_INET); 1140 xt_compat_flush_offsets(AF_INET);
1141 private = &tmp; 1141 private = &tmp;
@@ -1167,7 +1167,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
1167} 1167}
1168 1168
1169static int 1169static int
1170get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) 1170get_entries(struct net *net, struct ipt_get_entries __user *uptr,
1171 const int *len)
1171{ 1172{
1172 int ret; 1173 int ret;
1173 struct ipt_get_entries get; 1174 struct ipt_get_entries get;
@@ -1215,6 +1216,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1215 struct xt_table_info *oldinfo; 1216 struct xt_table_info *oldinfo;
1216 struct xt_counters *counters; 1217 struct xt_counters *counters;
1217 void *loc_cpu_old_entry; 1218 void *loc_cpu_old_entry;
1219 struct ipt_entry *iter;
1218 1220
1219 ret = 0; 1221 ret = 0;
1220 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 1222 counters = vmalloc(num_counters * sizeof(struct xt_counters));
@@ -1257,8 +1259,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1257 1259
1258 /* Decrease module usage counts and free resource */ 1260 /* Decrease module usage counts and free resource */
1259 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1261 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1260 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1262 xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
1261 NULL); 1263 cleanup_entry(iter, net);
1264
1262 xt_free_table_info(oldinfo); 1265 xt_free_table_info(oldinfo);
1263 if (copy_to_user(counters_ptr, counters, 1266 if (copy_to_user(counters_ptr, counters,
1264 sizeof(struct xt_counters) * num_counters) != 0) 1267 sizeof(struct xt_counters) * num_counters) != 0)
@@ -1277,12 +1280,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1277} 1280}
1278 1281
1279static int 1282static int
1280do_replace(struct net *net, void __user *user, unsigned int len) 1283do_replace(struct net *net, const void __user *user, unsigned int len)
1281{ 1284{
1282 int ret; 1285 int ret;
1283 struct ipt_replace tmp; 1286 struct ipt_replace tmp;
1284 struct xt_table_info *newinfo; 1287 struct xt_table_info *newinfo;
1285 void *loc_cpu_entry; 1288 void *loc_cpu_entry;
1289 struct ipt_entry *iter;
1286 1290
1287 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1291 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1288 return -EFAULT; 1292 return -EFAULT;
@@ -1303,9 +1307,7 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1303 goto free_newinfo; 1307 goto free_newinfo;
1304 } 1308 }
1305 1309
1306 ret = translate_table(tmp.name, tmp.valid_hooks, 1310 ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
1307 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1308 tmp.hook_entry, tmp.underflow);
1309 if (ret != 0) 1311 if (ret != 0)
1310 goto free_newinfo; 1312 goto free_newinfo;
1311 1313
@@ -1318,27 +1320,16 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1318 return 0; 1320 return 0;
1319 1321
1320 free_newinfo_untrans: 1322 free_newinfo_untrans:
1321 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1323 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1324 cleanup_entry(iter, net);
1322 free_newinfo: 1325 free_newinfo:
1323 xt_free_table_info(newinfo); 1326 xt_free_table_info(newinfo);
1324 return ret; 1327 return ret;
1325} 1328}
1326 1329
1327/* We're lazy, and add to the first CPU; overflow works its fey magic
1328 * and everything is OK. */
1329static int 1330static int
1330add_counter_to_entry(struct ipt_entry *e, 1331do_add_counters(struct net *net, const void __user *user,
1331 const struct xt_counters addme[], 1332 unsigned int len, int compat)
1332 unsigned int *i)
1333{
1334 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1335
1336 (*i)++;
1337 return 0;
1338}
1339
1340static int
1341do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
1342{ 1333{
1343 unsigned int i, curcpu; 1334 unsigned int i, curcpu;
1344 struct xt_counters_info tmp; 1335 struct xt_counters_info tmp;
@@ -1351,6 +1342,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1351 const struct xt_table_info *private; 1342 const struct xt_table_info *private;
1352 int ret = 0; 1343 int ret = 0;
1353 void *loc_cpu_entry; 1344 void *loc_cpu_entry;
1345 struct ipt_entry *iter;
1354#ifdef CONFIG_COMPAT 1346#ifdef CONFIG_COMPAT
1355 struct compat_xt_counters_info compat_tmp; 1347 struct compat_xt_counters_info compat_tmp;
1356 1348
@@ -1408,11 +1400,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1408 curcpu = smp_processor_id(); 1400 curcpu = smp_processor_id();
1409 loc_cpu_entry = private->entries[curcpu]; 1401 loc_cpu_entry = private->entries[curcpu];
1410 xt_info_wrlock(curcpu); 1402 xt_info_wrlock(curcpu);
1411 IPT_ENTRY_ITERATE(loc_cpu_entry, 1403 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1412 private->size, 1404 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1413 add_counter_to_entry, 1405 ++i;
1414 paddc, 1406 }
1415 &i);
1416 xt_info_wrunlock(curcpu); 1407 xt_info_wrunlock(curcpu);
1417 unlock_up_free: 1408 unlock_up_free:
1418 local_bh_enable(); 1409 local_bh_enable();
@@ -1440,45 +1431,40 @@ struct compat_ipt_replace {
1440static int 1431static int
1441compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, 1432compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1442 unsigned int *size, struct xt_counters *counters, 1433 unsigned int *size, struct xt_counters *counters,
1443 unsigned int *i) 1434 unsigned int i)
1444{ 1435{
1445 struct ipt_entry_target *t; 1436 struct ipt_entry_target *t;
1446 struct compat_ipt_entry __user *ce; 1437 struct compat_ipt_entry __user *ce;
1447 u_int16_t target_offset, next_offset; 1438 u_int16_t target_offset, next_offset;
1448 compat_uint_t origsize; 1439 compat_uint_t origsize;
1449 int ret; 1440 const struct xt_entry_match *ematch;
1441 int ret = 0;
1450 1442
1451 ret = -EFAULT;
1452 origsize = *size; 1443 origsize = *size;
1453 ce = (struct compat_ipt_entry __user *)*dstptr; 1444 ce = (struct compat_ipt_entry __user *)*dstptr;
1454 if (copy_to_user(ce, e, sizeof(struct ipt_entry))) 1445 if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
1455 goto out; 1446 copy_to_user(&ce->counters, &counters[i],
1456 1447 sizeof(counters[i])) != 0)
1457 if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) 1448 return -EFAULT;
1458 goto out;
1459 1449
1460 *dstptr += sizeof(struct compat_ipt_entry); 1450 *dstptr += sizeof(struct compat_ipt_entry);
1461 *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1451 *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1462 1452
1463 ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); 1453 xt_ematch_foreach(ematch, e) {
1454 ret = xt_compat_match_to_user(ematch, dstptr, size);
1455 if (ret != 0)
1456 return ret;
1457 }
1464 target_offset = e->target_offset - (origsize - *size); 1458 target_offset = e->target_offset - (origsize - *size);
1465 if (ret)
1466 goto out;
1467 t = ipt_get_target(e); 1459 t = ipt_get_target(e);
1468 ret = xt_compat_target_to_user(t, dstptr, size); 1460 ret = xt_compat_target_to_user(t, dstptr, size);
1469 if (ret) 1461 if (ret)
1470 goto out; 1462 return ret;
1471 ret = -EFAULT;
1472 next_offset = e->next_offset - (origsize - *size); 1463 next_offset = e->next_offset - (origsize - *size);
1473 if (put_user(target_offset, &ce->target_offset)) 1464 if (put_user(target_offset, &ce->target_offset) != 0 ||
1474 goto out; 1465 put_user(next_offset, &ce->next_offset) != 0)
1475 if (put_user(next_offset, &ce->next_offset)) 1466 return -EFAULT;
1476 goto out;
1477
1478 (*i)++;
1479 return 0; 1467 return 0;
1480out:
1481 return ret;
1482} 1468}
1483 1469
1484static int 1470static int
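
compat_copy_entry_to_user() above now receives the counter index by value: the caller owns the loop (see compat_copy_entries_to_user later in this patch) and post-increments i itself, and the error paths return directly instead of funneling through an out label. Reduced to essentials:

    #include <stdio.h>

    static int copy_one(int value, unsigned int i)
    {
            printf("slot %u <- %d\n", i, value);
            return 0;
    }

    int main(void)
    {
            int src[3] = { 7, 8, 9 };
            unsigned int i = 0;
            int ret = 0;

            for (int k = 0; k < 3; k++) {
                    ret = copy_one(src[k], i++); /* caller advances the index */
                    if (ret != 0)
                            break;
            }
            return ret;
    }
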
@@ -1486,7 +1472,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
1486 const char *name, 1472 const char *name,
1487 const struct ipt_ip *ip, 1473 const struct ipt_ip *ip,
1488 unsigned int hookmask, 1474 unsigned int hookmask,
1489 int *size, unsigned int *i) 1475 int *size)
1490{ 1476{
1491 struct xt_match *match; 1477 struct xt_match *match;
1492 1478
@@ -1500,47 +1486,32 @@ compat_find_calc_match(struct ipt_entry_match *m,
1500 } 1486 }
1501 m->u.kernel.match = match; 1487 m->u.kernel.match = match;
1502 *size += xt_compat_match_offset(match); 1488 *size += xt_compat_match_offset(match);
1503
1504 (*i)++;
1505 return 0;
1506}
1507
1508static int
1509compat_release_match(struct ipt_entry_match *m, unsigned int *i)
1510{
1511 if (i && (*i)-- == 0)
1512 return 1;
1513
1514 module_put(m->u.kernel.match->me);
1515 return 0; 1489 return 0;
1516} 1490}
1517 1491
1518static int 1492static void compat_release_entry(struct compat_ipt_entry *e)
1519compat_release_entry(struct compat_ipt_entry *e, unsigned int *i)
1520{ 1493{
1521 struct ipt_entry_target *t; 1494 struct ipt_entry_target *t;
1522 1495 struct xt_entry_match *ematch;
1523 if (i && (*i)-- == 0)
1524 return 1;
1525 1496
1526 /* Cleanup all matches */ 1497 /* Cleanup all matches */
1527 COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); 1498 xt_ematch_foreach(ematch, e)
1499 module_put(ematch->u.kernel.match->me);
1528 t = compat_ipt_get_target(e); 1500 t = compat_ipt_get_target(e);
1529 module_put(t->u.kernel.target->me); 1501 module_put(t->u.kernel.target->me);
1530 return 0;
1531} 1502}
1532 1503
1533static int 1504static int
1534check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, 1505check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1535 struct xt_table_info *newinfo, 1506 struct xt_table_info *newinfo,
1536 unsigned int *size, 1507 unsigned int *size,
1537 unsigned char *base, 1508 const unsigned char *base,
1538 unsigned char *limit, 1509 const unsigned char *limit,
1539 unsigned int *hook_entries, 1510 const unsigned int *hook_entries,
1540 unsigned int *underflows, 1511 const unsigned int *underflows,
1541 unsigned int *i,
1542 const char *name) 1512 const char *name)
1543{ 1513{
1514 struct xt_entry_match *ematch;
1544 struct ipt_entry_target *t; 1515 struct ipt_entry_target *t;
1545 struct xt_target *target; 1516 struct xt_target *target;
1546 unsigned int entry_offset; 1517 unsigned int entry_offset;
@@ -1548,8 +1519,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1548 int ret, off, h; 1519 int ret, off, h;
1549 1520
1550 duprintf("check_compat_entry_size_and_hooks %p\n", e); 1521 duprintf("check_compat_entry_size_and_hooks %p\n", e);
1551 if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 1522 if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
1552 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { 1523 (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
1553 duprintf("Bad offset %p, limit = %p\n", e, limit); 1524 duprintf("Bad offset %p, limit = %p\n", e, limit);
1554 return -EINVAL; 1525 return -EINVAL;
1555 } 1526 }
@@ -1569,10 +1540,13 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1569 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1540 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1570 entry_offset = (void *)e - (void *)base; 1541 entry_offset = (void *)e - (void *)base;
1571 j = 0; 1542 j = 0;
1572 ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, 1543 xt_ematch_foreach(ematch, e) {
1573 &e->ip, e->comefrom, &off, &j); 1544 ret = compat_find_calc_match(ematch, name,
1574 if (ret != 0) 1545 &e->ip, e->comefrom, &off);
1575 goto release_matches; 1546 if (ret != 0)
1547 goto release_matches;
1548 ++j;
1549 }
1576 1550
1577 t = compat_ipt_get_target(e); 1551 t = compat_ipt_get_target(e);
1578 target = try_then_request_module(xt_find_target(AF_INET, 1552 target = try_then_request_module(xt_find_target(AF_INET,
@@ -1604,14 +1578,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1604 /* Clear counters and comefrom */ 1578 /* Clear counters and comefrom */
1605 memset(&e->counters, 0, sizeof(e->counters)); 1579 memset(&e->counters, 0, sizeof(e->counters));
1606 e->comefrom = 0; 1580 e->comefrom = 0;
1607
1608 (*i)++;
1609 return 0; 1581 return 0;
1610 1582
1611out: 1583out:
1612 module_put(t->u.kernel.target->me); 1584 module_put(t->u.kernel.target->me);
1613release_matches: 1585release_matches:
1614 IPT_MATCH_ITERATE(e, compat_release_match, &j); 1586 xt_ematch_foreach(ematch, e) {
1587 if (j-- == 0)
1588 break;
1589 module_put(ematch->u.kernel.match->me);
1590 }
1615 return ret; 1591 return ret;
1616} 1592}
1617 1593
@@ -1625,6 +1601,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1625 struct ipt_entry *de; 1601 struct ipt_entry *de;
1626 unsigned int origsize; 1602 unsigned int origsize;
1627 int ret, h; 1603 int ret, h;
1604 struct xt_entry_match *ematch;
1628 1605
1629 ret = 0; 1606 ret = 0;
1630 origsize = *size; 1607 origsize = *size;
@@ -1635,10 +1612,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1635 *dstptr += sizeof(struct ipt_entry); 1612 *dstptr += sizeof(struct ipt_entry);
1636 *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1613 *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1637 1614
1638 ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, 1615 xt_ematch_foreach(ematch, e) {
1639 dstptr, size); 1616 ret = xt_compat_match_from_user(ematch, dstptr, size);
1640 if (ret) 1617 if (ret != 0)
1641 return ret; 1618 return ret;
1619 }
1642 de->target_offset = e->target_offset - (origsize - *size); 1620 de->target_offset = e->target_offset - (origsize - *size);
1643 t = compat_ipt_get_target(e); 1621 t = compat_ipt_get_target(e);
1644 target = t->u.kernel.target; 1622 target = t->u.kernel.target;
@@ -1655,36 +1633,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1655} 1633}
1656 1634
1657static int 1635static int
1658compat_check_entry(struct ipt_entry *e, const char *name, 1636compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
1659 unsigned int *i)
1660{ 1637{
1638 struct xt_entry_match *ematch;
1661 struct xt_mtchk_param mtpar; 1639 struct xt_mtchk_param mtpar;
1662 unsigned int j; 1640 unsigned int j;
1663 int ret; 1641 int ret = 0;
1664 1642
1665 j = 0; 1643 j = 0;
1644 mtpar.net = net;
1666 mtpar.table = name; 1645 mtpar.table = name;
1667 mtpar.entryinfo = &e->ip; 1646 mtpar.entryinfo = &e->ip;
1668 mtpar.hook_mask = e->comefrom; 1647 mtpar.hook_mask = e->comefrom;
1669 mtpar.family = NFPROTO_IPV4; 1648 mtpar.family = NFPROTO_IPV4;
1670 ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); 1649 xt_ematch_foreach(ematch, e) {
1671 if (ret) 1650 ret = check_match(ematch, &mtpar);
1672 goto cleanup_matches; 1651 if (ret != 0)
1652 goto cleanup_matches;
1653 ++j;
1654 }
1673 1655
1674 ret = check_target(e, name); 1656 ret = check_target(e, net, name);
1675 if (ret) 1657 if (ret)
1676 goto cleanup_matches; 1658 goto cleanup_matches;
1677
1678 (*i)++;
1679 return 0; 1659 return 0;
1680 1660
1681 cleanup_matches: 1661 cleanup_matches:
1682 IPT_MATCH_ITERATE(e, cleanup_match, &j); 1662 xt_ematch_foreach(ematch, e) {
1663 if (j-- == 0)
1664 break;
1665 cleanup_match(ematch, net);
1666 }
1683 return ret; 1667 return ret;
1684} 1668}
1685 1669
1686static int 1670static int
1687translate_compat_table(const char *name, 1671translate_compat_table(struct net *net,
1672 const char *name,
1688 unsigned int valid_hooks, 1673 unsigned int valid_hooks,
1689 struct xt_table_info **pinfo, 1674 struct xt_table_info **pinfo,
1690 void **pentry0, 1675 void **pentry0,
@@ -1696,6 +1681,8 @@ translate_compat_table(const char *name,
1696 unsigned int i, j; 1681 unsigned int i, j;
1697 struct xt_table_info *newinfo, *info; 1682 struct xt_table_info *newinfo, *info;
1698 void *pos, *entry0, *entry1; 1683 void *pos, *entry0, *entry1;
1684 struct compat_ipt_entry *iter0;
1685 struct ipt_entry *iter1;
1699 unsigned int size; 1686 unsigned int size;
1700 int ret; 1687 int ret;
1701 1688
@@ -1714,13 +1701,17 @@ translate_compat_table(const char *name,
1714 j = 0; 1701 j = 0;
1715 xt_compat_lock(AF_INET); 1702 xt_compat_lock(AF_INET);
1716 /* Walk through entries, checking offsets. */ 1703 /* Walk through entries, checking offsets. */
1717 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1704 xt_entry_foreach(iter0, entry0, total_size) {
1718 check_compat_entry_size_and_hooks, 1705 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
1719 info, &size, entry0, 1706 entry0,
1720 entry0 + total_size, 1707 entry0 + total_size,
1721 hook_entries, underflows, &j, name); 1708 hook_entries,
1722 if (ret != 0) 1709 underflows,
1723 goto out_unlock; 1710 name);
1711 if (ret != 0)
1712 goto out_unlock;
1713 ++j;
1714 }
1724 1715
1725 ret = -EINVAL; 1716 ret = -EINVAL;
1726 if (j != number) { 1717 if (j != number) {
@@ -1759,9 +1750,12 @@ translate_compat_table(const char *name,
1759 entry1 = newinfo->entries[raw_smp_processor_id()]; 1750 entry1 = newinfo->entries[raw_smp_processor_id()];
1760 pos = entry1; 1751 pos = entry1;
1761 size = total_size; 1752 size = total_size;
1762 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1753 xt_entry_foreach(iter0, entry0, total_size) {
1763 compat_copy_entry_from_user, 1754 ret = compat_copy_entry_from_user(iter0, &pos, &size,
1764 &pos, &size, name, newinfo, entry1); 1755 name, newinfo, entry1);
1756 if (ret != 0)
1757 break;
1758 }
1765 xt_compat_flush_offsets(AF_INET); 1759 xt_compat_flush_offsets(AF_INET);
1766 xt_compat_unlock(AF_INET); 1760 xt_compat_unlock(AF_INET);
1767 if (ret) 1761 if (ret)
@@ -1772,13 +1766,32 @@ translate_compat_table(const char *name,
1772 goto free_newinfo; 1766 goto free_newinfo;
1773 1767
1774 i = 0; 1768 i = 0;
1775 ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, 1769 xt_entry_foreach(iter1, entry1, newinfo->size) {
1776 name, &i); 1770 ret = compat_check_entry(iter1, net, name);
1771 if (ret != 0)
1772 break;
1773 ++i;
1774 }
1777 if (ret) { 1775 if (ret) {
1776 /*
1777 * The first i matches need cleanup_entry (calls ->destroy)
1778 * because they had called ->check already. The other j-i
1779 * entries need only release.
1780 */
1781 int skip = i;
1778 j -= i; 1782 j -= i;
1779 COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, 1783 xt_entry_foreach(iter0, entry0, newinfo->size) {
1780 compat_release_entry, &j); 1784 if (skip-- > 0)
1781 IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); 1785 continue;
1786 if (j-- == 0)
1787 break;
1788 compat_release_entry(iter0);
1789 }
1790 xt_entry_foreach(iter1, entry1, newinfo->size) {
1791 if (i-- == 0)
1792 break;
1793 cleanup_entry(iter1, net);
1794 }
1782 xt_free_table_info(newinfo); 1795 xt_free_table_info(newinfo);
1783 return ret; 1796 return ret;
1784 } 1797 }
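
The comment added above states the invariant; in miniature, with j entries set up and only the first i of them fully checked (toy helpers, printf in place of the real cleanup calls):

    #include <stdio.h>

    #define N 5

    static void release(int k) { printf("release source %d\n", k); }
    static void cleanup(int k) { printf("cleanup translated %d\n", k); }

    int main(void)
    {
            unsigned int j = 5;     /* entries that passed set-up */
            unsigned int i = 2;     /* of those, entries that passed ->check */
            int skip = i;

            j -= i;                 /* only j-i entries still need release */
            for (int k = 0; k < N; k++) {
                    if (skip-- > 0)         /* first i: handled below */
                            continue;
                    if (j-- == 0)
                            break;
                    release(k);
            }
            for (int k = 0; k < N; k++) {
                    if (i-- == 0)
                            break;
                    cleanup(k);
            }
            return 0;
    }
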
@@ -1796,7 +1809,11 @@ translate_compat_table(const char *name,
1796free_newinfo: 1809free_newinfo:
1797 xt_free_table_info(newinfo); 1810 xt_free_table_info(newinfo);
1798out: 1811out:
1799 COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1812 xt_entry_foreach(iter0, entry0, total_size) {
1813 if (j-- == 0)
1814 break;
1815 compat_release_entry(iter0);
1816 }
1800 return ret; 1817 return ret;
1801out_unlock: 1818out_unlock:
1802 xt_compat_flush_offsets(AF_INET); 1819 xt_compat_flush_offsets(AF_INET);
@@ -1811,6 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1811 struct compat_ipt_replace tmp; 1828 struct compat_ipt_replace tmp;
1812 struct xt_table_info *newinfo; 1829 struct xt_table_info *newinfo;
1813 void *loc_cpu_entry; 1830 void *loc_cpu_entry;
1831 struct ipt_entry *iter;
1814 1832
1815 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1833 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1816 return -EFAULT; 1834 return -EFAULT;
@@ -1833,7 +1851,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1833 goto free_newinfo; 1851 goto free_newinfo;
1834 } 1852 }
1835 1853
1836 ret = translate_compat_table(tmp.name, tmp.valid_hooks, 1854 ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
1837 &newinfo, &loc_cpu_entry, tmp.size, 1855 &newinfo, &loc_cpu_entry, tmp.size,
1838 tmp.num_entries, tmp.hook_entry, 1856 tmp.num_entries, tmp.hook_entry,
1839 tmp.underflow); 1857 tmp.underflow);
@@ -1849,7 +1867,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1849 return 0; 1867 return 0;
1850 1868
1851 free_newinfo_untrans: 1869 free_newinfo_untrans:
1852 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1870 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1871 cleanup_entry(iter, net);
1853 free_newinfo: 1872 free_newinfo:
1854 xt_free_table_info(newinfo); 1873 xt_free_table_info(newinfo);
1855 return ret; 1874 return ret;
@@ -1898,6 +1917,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1898 int ret = 0; 1917 int ret = 0;
1899 const void *loc_cpu_entry; 1918 const void *loc_cpu_entry;
1900 unsigned int i = 0; 1919 unsigned int i = 0;
1920 struct ipt_entry *iter;
1901 1921
1902 counters = alloc_counters(table); 1922 counters = alloc_counters(table);
1903 if (IS_ERR(counters)) 1923 if (IS_ERR(counters))
@@ -1910,9 +1930,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1910 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1930 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1911 pos = userptr; 1931 pos = userptr;
1912 size = total_size; 1932 size = total_size;
1913 ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, 1933 xt_entry_foreach(iter, loc_cpu_entry, total_size) {
1914 compat_copy_entry_to_user, 1934 ret = compat_copy_entry_to_user(iter, &pos,
1915 &pos, &size, counters, &i); 1935 &size, counters, i++);
1936 if (ret != 0)
1937 break;
1938 }
1916 1939
1917 vfree(counters); 1940 vfree(counters);
1918 return ret; 1941 return ret;
@@ -2086,11 +2109,7 @@ struct xt_table *ipt_register_table(struct net *net,
2086 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; 2109 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
2087 memcpy(loc_cpu_entry, repl->entries, repl->size); 2110 memcpy(loc_cpu_entry, repl->entries, repl->size);
2088 2111
2089 ret = translate_table(table->name, table->valid_hooks, 2112 ret = translate_table(net, newinfo, loc_cpu_entry, repl);
2090 newinfo, loc_cpu_entry, repl->size,
2091 repl->num_entries,
2092 repl->hook_entry,
2093 repl->underflow);
2094 if (ret != 0) 2113 if (ret != 0)
2095 goto out_free; 2114 goto out_free;
2096 2115
@@ -2108,17 +2127,19 @@ out:
2108 return ERR_PTR(ret); 2127 return ERR_PTR(ret);
2109} 2128}
2110 2129
2111void ipt_unregister_table(struct xt_table *table) 2130void ipt_unregister_table(struct net *net, struct xt_table *table)
2112{ 2131{
2113 struct xt_table_info *private; 2132 struct xt_table_info *private;
2114 void *loc_cpu_entry; 2133 void *loc_cpu_entry;
2115 struct module *table_owner = table->me; 2134 struct module *table_owner = table->me;
2135 struct ipt_entry *iter;
2116 2136
2117 private = xt_unregister_table(table); 2137 private = xt_unregister_table(table);
2118 2138
2119 /* Decrease module usage counts and free resources */ 2139 /* Decrease module usage counts and free resources */
2120 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 2140 loc_cpu_entry = private->entries[raw_smp_processor_id()];
2121 IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); 2141 xt_entry_foreach(iter, loc_cpu_entry, private->size)
2142 cleanup_entry(iter, net);
2122 if (private->number > private->initial_entries) 2143 if (private->number > private->initial_entries)
2123 module_put(table_owner); 2144 module_put(table_owner);
2124 xt_free_table_info(private); 2145 xt_free_table_info(private);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2e4f98b85524..ab828400ed71 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -14,6 +14,7 @@
14#include <linux/jhash.h> 14#include <linux/jhash.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/slab.h>
17#include <linux/ip.h> 18#include <linux/ip.h>
18#include <linux/tcp.h> 19#include <linux/tcp.h>
19#include <linux/udp.h> 20#include <linux/udp.h>
@@ -303,9 +304,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
303 304
304 /* special case: ICMP error handling. conntrack distinguishes between 305 /* special case: ICMP error handling. conntrack distinguishes between
305 * error messages (RELATED) and information requests (see below) */ 306 * error messages (RELATED) and information requests (see below) */
306 if (ip_hdr(skb)->protocol == IPPROTO_ICMP 307 if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
307 && (ctinfo == IP_CT_RELATED 308 (ctinfo == IP_CT_RELATED ||
308 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) 309 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
309 return XT_CONTINUE; 310 return XT_CONTINUE;
310 311
311 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 312 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -362,8 +363,8 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
362 return false; 363 return false;
363 364
364 } 365 }
365 if (e->ip.dmsk.s_addr != htonl(0xffffffff) 366 if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
366 || e->ip.dst.s_addr == 0) { 367 e->ip.dst.s_addr == 0) {
367 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); 368 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
368 return false; 369 return false;
369 } 370 }
@@ -495,14 +496,14 @@ arp_mangle(unsigned int hook,
495 struct clusterip_config *c; 496 struct clusterip_config *c;
496 497
497 /* we don't care about non-ethernet and non-ipv4 ARP */ 498 /* we don't care about non-ethernet and non-ipv4 ARP */
498 if (arp->ar_hrd != htons(ARPHRD_ETHER) 499 if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
499 || arp->ar_pro != htons(ETH_P_IP) 500 arp->ar_pro != htons(ETH_P_IP) ||
500 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 501 arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
501 return NF_ACCEPT; 502 return NF_ACCEPT;
502 503
503 /* we only want to mangle arp requests and replies */ 504 /* we only want to mangle arp requests and replies */
504 if (arp->ar_op != htons(ARPOP_REPLY) 505 if (arp->ar_op != htons(ARPOP_REPLY) &&
505 && arp->ar_op != htons(ARPOP_REQUEST)) 506 arp->ar_op != htons(ARPOP_REQUEST))
506 return NF_ACCEPT; 507 return NF_ACCEPT;
507 508
508 payload = (void *)(arp+1); 509 payload = (void *)(arp+1);
@@ -560,8 +561,7 @@ struct clusterip_seq_position {
560 561
561static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) 562static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
562{ 563{
563 const struct proc_dir_entry *pde = s->private; 564 struct clusterip_config *c = s->private;
564 struct clusterip_config *c = pde->data;
565 unsigned int weight; 565 unsigned int weight;
566 u_int32_t local_nodes; 566 u_int32_t local_nodes;
567 struct clusterip_seq_position *idx; 567 struct clusterip_seq_position *idx;
@@ -632,10 +632,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
632 632
633 if (!ret) { 633 if (!ret) {
634 struct seq_file *sf = file->private_data; 634 struct seq_file *sf = file->private_data;
635 struct proc_dir_entry *pde = PDE(inode); 635 struct clusterip_config *c = PDE(inode)->data;
636 struct clusterip_config *c = pde->data;
637 636
638 sf->private = pde; 637 sf->private = c;
639 638
640 clusterip_config_get(c); 639 clusterip_config_get(c);
641 } 640 }
@@ -645,8 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
645 644
646static int clusterip_proc_release(struct inode *inode, struct file *file) 645static int clusterip_proc_release(struct inode *inode, struct file *file)
647{ 646{
648 struct proc_dir_entry *pde = PDE(inode); 647 struct clusterip_config *c = PDE(inode)->data;
649 struct clusterip_config *c = pde->data;
650 int ret; 648 int ret;
651 649
652 ret = seq_release(inode, file); 650 ret = seq_release(inode, file);
@@ -660,10 +658,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
660static ssize_t clusterip_proc_write(struct file *file, const char __user *input, 658static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
661 size_t size, loff_t *ofs) 659 size_t size, loff_t *ofs)
662{ 660{
661 struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
663#define PROC_WRITELEN 10 662#define PROC_WRITELEN 10
664 char buffer[PROC_WRITELEN+1]; 663 char buffer[PROC_WRITELEN+1];
665 const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
666 struct clusterip_config *c = pde->data;
667 unsigned long nodenum; 664 unsigned long nodenum;
668 665
669 if (copy_from_user(buffer, input, PROC_WRITELEN)) 666 if (copy_from_user(buffer, input, PROC_WRITELEN))
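
The CLUSTERIP proc handlers now stash the clusterip_config pointer itself in seq_file->private instead of the owning proc_dir_entry, saving a pde->data dereference at every use. Toy types, same shape:

    #include <stdio.h>

    struct config   { int node; };
    struct seq_file { void *private; };     /* toy seq_file */

    static void show(const struct seq_file *s)
    {
            const struct config *c = s->private; /* direct, no pde->data hop */

            printf("local node %d\n", c->node);
    }

    int main(void)
    {
            struct config cfg = { 7 };
            struct seq_file sf = { &cfg };

            show(&sf);
            return 0;
    }
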
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index f7e2fa0974dc..ea5cea2415c1 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
50 struct tcphdr _tcph, *tcph; 50 struct tcphdr _tcph, *tcph;
51 __be16 oldval; 51 __be16 oldval;
52 52
53 /* Not enought header? */ 53 /* Not enough header? */
54 tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); 54 tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
55 if (!tcph) 55 if (!tcph)
56 return false; 56 return false;
@@ -85,8 +85,8 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
85 if (!set_ect_ip(skb, einfo)) 85 if (!set_ect_ip(skb, einfo))
86 return NF_DROP; 86 return NF_DROP;
87 87
88 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) 88 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
89 && ip_hdr(skb)->protocol == IPPROTO_TCP) 89 ip_hdr(skb)->protocol == IPPROTO_TCP)
90 if (!set_ect_tcp(skb, einfo)) 90 if (!set_ect_tcp(skb, einfo))
91 return NF_DROP; 91 return NF_DROP;
92 92
@@ -108,8 +108,8 @@ static bool ecn_tg_check(const struct xt_tgchk_param *par)
108 einfo->ip_ect); 108 einfo->ip_ect);
109 return false; 109 return false;
110 } 110 }
111 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) 111 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
112 && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { 112 (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
113 printk(KERN_WARNING "ECN: cannot use TCP operations on a " 113 printk(KERN_WARNING "ECN: cannot use TCP operations on a "
114 "non-tcp rule\n"); 114 "non-tcp rule\n");
115 return false; 115 return false;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index acc44c69eb68..ee128efa1c8d 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -74,8 +74,8 @@ static void dump_packet(const struct nf_loginfo *info,
74 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
75 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
76 76
77 if ((logflags & IPT_LOG_IPOPT) 77 if ((logflags & IPT_LOG_IPOPT) &&
78 && ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
79 const unsigned char *op; 79 const unsigned char *op;
80 unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; 80 unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
81 unsigned int i, optsize; 81 unsigned int i, optsize;
@@ -146,8 +146,8 @@ static void dump_packet(const struct nf_loginfo *info,
146 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 printk("URGP=%u ", ntohs(th->urg_ptr));
148 148
149 if ((logflags & IPT_LOG_TCPOPT) 149 if ((logflags & IPT_LOG_TCPOPT) &&
150 && th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; 151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 const unsigned char *op; 152 const unsigned char *op;
153 unsigned int i, optsize; 153 unsigned int i, optsize;
@@ -238,9 +238,9 @@ static void dump_packet(const struct nf_loginfo *info,
238 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 printk("TYPE=%u CODE=%u ", ich->type, ich->code);
239 239
240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
241 if (ich->type <= NR_ICMP_TYPES 241 if (ich->type <= NR_ICMP_TYPES &&
242 && required_len[ich->type] 242 required_len[ich->type] &&
243 && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
244 printk("INCOMPLETE [%u bytes] ", 244 printk("INCOMPLETE [%u bytes] ",
245 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
246 break; 246 break;
@@ -276,8 +276,8 @@ static void dump_packet(const struct nf_loginfo *info,
276 } 276 }
277 277
278 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
279 if (ich->type == ICMP_DEST_UNREACH 279 if (ich->type == ICMP_DEST_UNREACH &&
280 && ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
281 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 printk("MTU=%u ", ntohs(ich->un.frag.mtu));
282 } 282 }
283 break; 283 break;
@@ -407,8 +407,8 @@ ipt_log_packet(u_int8_t pf,
407 if (in && !out) { 407 if (in && !out) {
408 /* MAC logging for input chain only. */ 408 /* MAC logging for input chain only. */
409 printk("MAC="); 409 printk("MAC=");
410 if (skb->dev && skb->dev->hard_header_len 410 if (skb->dev && skb->dev->hard_header_len &&
411 && skb->mac_header != skb->network_header) { 411 skb->mac_header != skb->network_header) {
412 int i; 412 int i;
413 const unsigned char *p = skb_mac_header(skb); 413 const unsigned char *p = skb_mac_header(skb);
414 for (i = 0; i < skb->dev->hard_header_len; i++,p++) 414 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
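Most remaining hunks in ipt_ECN and ipt_LOG, and in the modules that follow (MASQUERADE, REJECT, ULOG, ecn, and the conntrack files further down), are the same mechanical style conversion: logical operators move from the start of a continuation line to the end of the previous one, which is the form scripts/checkpatch.pl asks for. Taking one of the ipt_LOG conditionals above as the example:

        /* old style: continuation line starts with the operator */
        if ((logflags & IPT_LOG_IPOPT)
            && ih->ihl * 4 > sizeof(struct iphdr))

        /* new style: operator ends the line, per checkpatch.pl */
        if ((logflags & IPT_LOG_IPOPT) &&
            ih->ihl * 4 > sizeof(struct iphdr))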
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index dada0863946d..650b54042b01 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
59 ct = nf_ct_get(skb, &ctinfo); 59 ct = nf_ct_get(skb, &ctinfo);
60 nat = nfct_nat(ct); 60 nat = nfct_nat(ct);
61 61
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
63 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 63 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
64 64
65 /* Source address is 0.0.0.0 - locally generated packet that is 65 /* Source address is 0.0.0.0 - locally generated packet that is
66 * probably not supposed to be masqueraded. 66 * probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index c93ae44bff2a..a0e8bcf04159 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -12,6 +12,7 @@
12 12
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include <linux/udp.h> 17#include <linux/udp.h>
17#include <linux/icmp.h> 18#include <linux/icmp.h>
@@ -184,8 +185,8 @@ static bool reject_tg_check(const struct xt_tgchk_param *par)
184 return false; 185 return false;
185 } else if (rejinfo->with == IPT_TCP_RESET) { 186 } else if (rejinfo->with == IPT_TCP_RESET) {
186 /* Must specify that it's a TCP packet */ 187 /* Must specify that it's a TCP packet */
187 if (e->ip.proto != IPPROTO_TCP 188 if (e->ip.proto != IPPROTO_TCP ||
188 || (e->ip.invflags & XT_INV_PROTO)) { 189 (e->ip.invflags & XT_INV_PROTO)) {
189 printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); 190 printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
190 return false; 191 return false;
191 } 192 }
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d32cc4bb328a..0dbe697f164f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -33,6 +33,7 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/socket.h> 35#include <linux/socket.h>
36#include <linux/slab.h>
36#include <linux/skbuff.h> 37#include <linux/skbuff.h>
37#include <linux/kernel.h> 38#include <linux/kernel.h>
38#include <linux/timer.h> 39#include <linux/timer.h>
@@ -226,9 +227,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
226 else 227 else
227 *(pm->prefix) = '\0'; 228 *(pm->prefix) = '\0';
228 229
229 if (in && in->hard_header_len > 0 230 if (in && in->hard_header_len > 0 &&
230 && skb->mac_header != skb->network_header 231 skb->mac_header != skb->network_header &&
231 && in->hard_header_len <= ULOG_MAC_LEN) { 232 in->hard_header_len <= ULOG_MAC_LEN) {
232 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); 233 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
233 pm->mac_len = in->hard_header_len; 234 pm->mac_len = in->hard_header_len;
234 } else 235 } else
@@ -338,7 +339,7 @@ struct compat_ipt_ulog_info {
338 char prefix[ULOG_PREFIX_LEN]; 339 char prefix[ULOG_PREFIX_LEN];
339}; 340};
340 341
341static void ulog_tg_compat_from_user(void *dst, void *src) 342static void ulog_tg_compat_from_user(void *dst, const void *src)
342{ 343{
343 const struct compat_ipt_ulog_info *cl = src; 344 const struct compat_ipt_ulog_info *cl = src;
344 struct ipt_ulog_info l = { 345 struct ipt_ulog_info l = {
@@ -351,7 +352,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src)
351 memcpy(dst, &l, sizeof(l)); 352 memcpy(dst, &l, sizeof(l));
352} 353}
353 354
354static int ulog_tg_compat_to_user(void __user *dst, void *src) 355static int ulog_tg_compat_to_user(void __user *dst, const void *src)
355{ 356{
356 const struct ipt_ulog_info *l = src; 357 const struct ipt_ulog_info *l = src;
357 struct compat_ipt_ulog_info cl = { 358 struct compat_ipt_ulog_info cl = {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 6289b64144c6..2a1e56b71908 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -96,8 +96,8 @@ static bool ecn_mt_check(const struct xt_mtchk_param *par)
96 if (info->invert & IPT_ECN_OP_MATCH_MASK) 96 if (info->invert & IPT_ECN_OP_MATCH_MASK)
97 return false; 97 return false;
98 98
99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) 99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
100 && ip->proto != IPPROTO_TCP) { 100 ip->proto != IPPROTO_TCP) {
101 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" 101 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
102 " non-tcp packets\n"); 102 " non-tcp packets\n");
103 return false; 103 return false;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index df566cbd68e5..55392466daa4 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/slab.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18MODULE_LICENSE("GPL"); 19MODULE_LICENSE("GPL");
@@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table");
23 (1 << NF_INET_FORWARD) | \ 24 (1 << NF_INET_FORWARD) | \
24 (1 << NF_INET_LOCAL_OUT)) 25 (1 << NF_INET_LOCAL_OUT))
25 26
26static struct
27{
28 struct ipt_replace repl;
29 struct ipt_standard entries[3];
30 struct ipt_error term;
31} initial_table __net_initdata = {
32 .repl = {
33 .name = "filter",
34 .valid_hooks = FILTER_VALID_HOOKS,
35 .num_entries = 4,
36 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
37 .hook_entry = {
38 [NF_INET_LOCAL_IN] = 0,
39 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
40 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
41 },
42 .underflow = {
43 [NF_INET_LOCAL_IN] = 0,
44 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
45 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
46 },
47 },
48 .entries = {
49 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
50 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
51 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
52 },
53 .term = IPT_ERROR_INIT, /* ERROR */
54};
55
56static const struct xt_table packet_filter = { 27static const struct xt_table packet_filter = {
57 .name = "filter", 28 .name = "filter",
58 .valid_hooks = FILTER_VALID_HOOKS, 29 .valid_hooks = FILTER_VALID_HOOKS,
59 .me = THIS_MODULE, 30 .me = THIS_MODULE,
60 .af = NFPROTO_IPV4, 31 .af = NFPROTO_IPV4,
32 .priority = NF_IP_PRI_FILTER,
61}; 33};
62 34
63/* The work comes in here from netfilter.c. */
64static unsigned int
65ipt_local_in_hook(unsigned int hook,
66 struct sk_buff *skb,
67 const struct net_device *in,
68 const struct net_device *out,
69 int (*okfn)(struct sk_buff *))
70{
71 return ipt_do_table(skb, hook, in, out,
72 dev_net(in)->ipv4.iptable_filter);
73}
74
75static unsigned int 35static unsigned int
76ipt_hook(unsigned int hook, 36iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
77 struct sk_buff *skb, 37 const struct net_device *in, const struct net_device *out,
78 const struct net_device *in, 38 int (*okfn)(struct sk_buff *))
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{ 39{
82 return ipt_do_table(skb, hook, in, out, 40 const struct net *net;
83 dev_net(in)->ipv4.iptable_filter);
84}
85 41
86static unsigned int 42 if (hook == NF_INET_LOCAL_OUT &&
87ipt_local_out_hook(unsigned int hook, 43 (skb->len < sizeof(struct iphdr) ||
88 struct sk_buff *skb, 44 ip_hdrlen(skb) < sizeof(struct iphdr)))
89 const struct net_device *in, 45 /* root is playing with raw sockets. */
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 /* root is playing with raw sockets. */
94 if (skb->len < sizeof(struct iphdr) ||
95 ip_hdrlen(skb) < sizeof(struct iphdr))
96 return NF_ACCEPT; 46 return NF_ACCEPT;
97 return ipt_do_table(skb, hook, in, out, 47
98 dev_net(out)->ipv4.iptable_filter); 48 net = dev_net((in != NULL) ? in : out);
49 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
99} 50}
100 51
101static struct nf_hook_ops ipt_ops[] __read_mostly = { 52static struct nf_hook_ops *filter_ops __read_mostly;
102 {
103 .hook = ipt_local_in_hook,
104 .owner = THIS_MODULE,
105 .pf = NFPROTO_IPV4,
106 .hooknum = NF_INET_LOCAL_IN,
107 .priority = NF_IP_PRI_FILTER,
108 },
109 {
110 .hook = ipt_hook,
111 .owner = THIS_MODULE,
112 .pf = NFPROTO_IPV4,
113 .hooknum = NF_INET_FORWARD,
114 .priority = NF_IP_PRI_FILTER,
115 },
116 {
117 .hook = ipt_local_out_hook,
118 .owner = THIS_MODULE,
119 .pf = NFPROTO_IPV4,
120 .hooknum = NF_INET_LOCAL_OUT,
121 .priority = NF_IP_PRI_FILTER,
122 },
123};
124 53
125/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
126static int forward = NF_ACCEPT; 55static int forward = NF_ACCEPT;
@@ -128,9 +57,18 @@ module_param(forward, bool, 0000);
128 57
129static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
130{ 59{
131 /* Register table */ 60 struct ipt_replace *repl;
61
62 repl = ipt_alloc_initial_table(&packet_filter);
63 if (repl == NULL)
64 return -ENOMEM;
65 /* Entry 1 is the FORWARD hook */
66 ((struct ipt_standard *)repl->entries)[1].target.verdict =
67 -forward - 1;
68
132 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
133 ipt_register_table(net, &packet_filter, &initial_table.repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl);
134 if (IS_ERR(net->ipv4.iptable_filter)) 72 if (IS_ERR(net->ipv4.iptable_filter))
135 return PTR_ERR(net->ipv4.iptable_filter); 73 return PTR_ERR(net->ipv4.iptable_filter);
136 return 0; 74 return 0;
@@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
138 76
139static void __net_exit iptable_filter_net_exit(struct net *net) 77static void __net_exit iptable_filter_net_exit(struct net *net)
140{ 78{
141 ipt_unregister_table(net->ipv4.iptable_filter); 79 ipt_unregister_table(net, net->ipv4.iptable_filter);
142} 80}
143 81
144static struct pernet_operations iptable_filter_net_ops = { 82static struct pernet_operations iptable_filter_net_ops = {
@@ -155,17 +93,16 @@ static int __init iptable_filter_init(void)
155 return -EINVAL; 93 return -EINVAL;
156 } 94 }
157 95
158 /* Entry 1 is the FORWARD hook */
159 initial_table.entries[1].target.verdict = -forward - 1;
160
161 ret = register_pernet_subsys(&iptable_filter_net_ops); 96 ret = register_pernet_subsys(&iptable_filter_net_ops);
162 if (ret < 0) 97 if (ret < 0)
163 return ret; 98 return ret;
164 99
165 /* Register hooks */ 100 /* Register hooks */
166 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 101 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
167 if (ret < 0) 102 if (IS_ERR(filter_ops)) {
103 ret = PTR_ERR(filter_ops);
168 goto cleanup_table; 104 goto cleanup_table;
105 }
169 106
170 return ret; 107 return ret;
171 108
@@ -176,7 +113,7 @@ static int __init iptable_filter_init(void)
176 113
177static void __exit iptable_filter_fini(void) 114static void __exit iptable_filter_fini(void)
178{ 115{
179 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 116 xt_hook_unlink(&packet_filter, filter_ops);
180 unregister_pernet_subsys(&iptable_filter_net_ops); 117 unregister_pernet_subsys(&iptable_filter_net_ops);
181} 118}
182 119
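iptable_filter is the first of four tables in this series converted from an open-coded nf_hook_ops array to xt_hook_link(): the xt_table now carries its own priority, a single hook function dispatches on hooknum, and the x_tables core materialises one nf_hook_ops entry per bit set in valid_hooks. Roughly what the new helper does (a sketch, not the verbatim net/netfilter/x_tables.c code; assumes <linux/err.h>, <linux/bitops.h> and <linux/slab.h>):

        struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
        {
                unsigned int hook_mask = table->valid_hooks;
                u8 i, num_hooks = hweight32(hook_mask);
                u8 hooknum;
                struct nf_hook_ops *ops;
                int ret;

                ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
                if (ops == NULL)
                        return ERR_PTR(-ENOMEM);

                /* one registration per hook the table is valid on */
                for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0;
                     hook_mask >>= 1, ++hooknum) {
                        if (!(hook_mask & 1))
                                continue;
                        ops[i].hook     = fn;
                        ops[i].owner    = table->me;
                        ops[i].pf       = table->af;
                        ops[i].hooknum  = hooknum;
                        ops[i].priority = table->priority;
                        ++i;
                }

                ret = nf_register_hooks(ops, num_hooks);
                if (ret < 0) {
                        kfree(ops);
                        return ERR_PTR(ret);
                }
                return ops;
        }

xt_hook_unlink() is the inverse: it unregisters the same array and frees it, which is why the converted modules only keep a single ops pointer around.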
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 036047f9b0f2..294a2a32f293 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -12,6 +12,7 @@
12#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
13#include <linux/netdevice.h> 13#include <linux/netdevice.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <net/sock.h> 16#include <net/sock.h>
16#include <net/route.h> 17#include <net/route.h>
17#include <linux/ip.h> 18#include <linux/ip.h>
@@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table");
27 (1 << NF_INET_LOCAL_OUT) | \ 28 (1 << NF_INET_LOCAL_OUT) | \
28 (1 << NF_INET_POST_ROUTING)) 29 (1 << NF_INET_POST_ROUTING))
29 30
30/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
31static const struct
32{
33 struct ipt_replace repl;
34 struct ipt_standard entries[5];
35 struct ipt_error term;
36} initial_table __net_initdata = {
37 .repl = {
38 .name = "mangle",
39 .valid_hooks = MANGLE_VALID_HOOKS,
40 .num_entries = 6,
41 .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
42 .hook_entry = {
43 [NF_INET_PRE_ROUTING] = 0,
44 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
45 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
46 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
47 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
48 },
49 .underflow = {
50 [NF_INET_PRE_ROUTING] = 0,
51 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
52 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
53 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
54 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
55 },
56 },
57 .entries = {
58 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
59 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
60 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
61 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
62 IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
63 },
64 .term = IPT_ERROR_INIT, /* ERROR */
65};
66
67static const struct xt_table packet_mangler = { 31static const struct xt_table packet_mangler = {
68 .name = "mangle", 32 .name = "mangle",
69 .valid_hooks = MANGLE_VALID_HOOKS, 33 .valid_hooks = MANGLE_VALID_HOOKS,
70 .me = THIS_MODULE, 34 .me = THIS_MODULE,
71 .af = NFPROTO_IPV4, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_MANGLE,
72}; 37};
73 38
74/* The work comes in here from netfilter.c. */
75static unsigned int
76ipt_pre_routing_hook(unsigned int hook,
77 struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{
82 return ipt_do_table(skb, hook, in, out,
83 dev_net(in)->ipv4.iptable_mangle);
84}
85
86static unsigned int
87ipt_post_routing_hook(unsigned int hook,
88 struct sk_buff *skb,
89 const struct net_device *in,
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 return ipt_do_table(skb, hook, in, out,
94 dev_net(out)->ipv4.iptable_mangle);
95}
96
97static unsigned int
98ipt_local_in_hook(unsigned int hook,
99 struct sk_buff *skb,
100 const struct net_device *in,
101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *))
103{
104 return ipt_do_table(skb, hook, in, out,
105 dev_net(in)->ipv4.iptable_mangle);
106}
107
108static unsigned int
109ipt_forward_hook(unsigned int hook,
110 struct sk_buff *skb,
111 const struct net_device *in,
112 const struct net_device *out,
113 int (*okfn)(struct sk_buff *))
114{
115 return ipt_do_table(skb, hook, in, out,
116 dev_net(in)->ipv4.iptable_mangle);
117}
118
119static unsigned int 39static unsigned int
120ipt_local_hook(unsigned int hook, 40ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
121 struct sk_buff *skb,
122 const struct net_device *in,
123 const struct net_device *out,
124 int (*okfn)(struct sk_buff *))
125{ 41{
126 unsigned int ret; 42 unsigned int ret;
127 const struct iphdr *iph; 43 const struct iphdr *iph;
@@ -130,8 +46,8 @@ ipt_local_hook(unsigned int hook,
130 u_int32_t mark; 46 u_int32_t mark;
131 47
132 /* root is playing with raw sockets. */ 48 /* root is playing with raw sockets. */
133 if (skb->len < sizeof(struct iphdr) 49 if (skb->len < sizeof(struct iphdr) ||
134 || ip_hdrlen(skb) < sizeof(struct iphdr)) 50 ip_hdrlen(skb) < sizeof(struct iphdr))
135 return NF_ACCEPT; 51 return NF_ACCEPT;
136 52
137 /* Save things which could affect route */ 53 /* Save things which could affect route */
@@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook,
141 daddr = iph->daddr; 57 daddr = iph->daddr;
142 tos = iph->tos; 58 tos = iph->tos;
143 59
144 ret = ipt_do_table(skb, hook, in, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
145 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
146 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
147 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
@@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook,
158 return ret; 74 return ret;
159} 75}
160 76
161static struct nf_hook_ops ipt_ops[] __read_mostly = { 77/* The work comes in here from netfilter.c. */
162 { 78static unsigned int
163 .hook = ipt_pre_routing_hook, 79iptable_mangle_hook(unsigned int hook,
164 .owner = THIS_MODULE, 80 struct sk_buff *skb,
165 .pf = NFPROTO_IPV4, 81 const struct net_device *in,
166 .hooknum = NF_INET_PRE_ROUTING, 82 const struct net_device *out,
167 .priority = NF_IP_PRI_MANGLE, 83 int (*okfn)(struct sk_buff *))
168 }, 84{
169 { 85 if (hook == NF_INET_LOCAL_OUT)
170 .hook = ipt_local_in_hook, 86 return ipt_mangle_out(skb, out);
171 .owner = THIS_MODULE, 87 if (hook == NF_INET_POST_ROUTING)
172 .pf = NFPROTO_IPV4, 88 return ipt_do_table(skb, hook, in, out,
173 .hooknum = NF_INET_LOCAL_IN, 89 dev_net(out)->ipv4.iptable_mangle);
174 .priority = NF_IP_PRI_MANGLE, 90 /* PREROUTING/INPUT/FORWARD: */
175 }, 91 return ipt_do_table(skb, hook, in, out,
176 { 92 dev_net(in)->ipv4.iptable_mangle);
177 .hook = ipt_forward_hook, 93}
178 .owner = THIS_MODULE, 94
179 .pf = NFPROTO_IPV4, 95static struct nf_hook_ops *mangle_ops __read_mostly;
180 .hooknum = NF_INET_FORWARD,
181 .priority = NF_IP_PRI_MANGLE,
182 },
183 {
184 .hook = ipt_local_hook,
185 .owner = THIS_MODULE,
186 .pf = NFPROTO_IPV4,
187 .hooknum = NF_INET_LOCAL_OUT,
188 .priority = NF_IP_PRI_MANGLE,
189 },
190 {
191 .hook = ipt_post_routing_hook,
192 .owner = THIS_MODULE,
193 .pf = NFPROTO_IPV4,
194 .hooknum = NF_INET_POST_ROUTING,
195 .priority = NF_IP_PRI_MANGLE,
196 },
197};
198 96
199static int __net_init iptable_mangle_net_init(struct net *net) 97static int __net_init iptable_mangle_net_init(struct net *net)
200{ 98{
201 /* Register table */ 99 struct ipt_replace *repl;
100
101 repl = ipt_alloc_initial_table(&packet_mangler);
102 if (repl == NULL)
103 return -ENOMEM;
202 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
203 ipt_register_table(net, &packet_mangler, &initial_table.repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl);
204 if (IS_ERR(net->ipv4.iptable_mangle)) 107 if (IS_ERR(net->ipv4.iptable_mangle))
205 return PTR_ERR(net->ipv4.iptable_mangle); 108 return PTR_ERR(net->ipv4.iptable_mangle);
206 return 0; 109 return 0;
@@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
208 111
209static void __net_exit iptable_mangle_net_exit(struct net *net) 112static void __net_exit iptable_mangle_net_exit(struct net *net)
210{ 113{
211 ipt_unregister_table(net->ipv4.iptable_mangle); 114 ipt_unregister_table(net, net->ipv4.iptable_mangle);
212} 115}
213 116
214static struct pernet_operations iptable_mangle_net_ops = { 117static struct pernet_operations iptable_mangle_net_ops = {
@@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void)
225 return ret; 128 return ret;
226 129
227 /* Register hooks */ 130 /* Register hooks */
228 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
229 if (ret < 0) 132 if (IS_ERR(mangle_ops)) {
133 ret = PTR_ERR(mangle_ops);
230 goto cleanup_table; 134 goto cleanup_table;
135 }
231 136
232 return ret; 137 return ret;
233 138
@@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void)
238 143
239static void __exit iptable_mangle_fini(void) 144static void __exit iptable_mangle_fini(void)
240{ 145{
241 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 146 xt_hook_unlink(&packet_mangler, mangle_ops);
242 unregister_pernet_subsys(&iptable_mangle_net_ops); 147 unregister_pernet_subsys(&iptable_mangle_net_ops);
243} 148}
244 149
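Mangle alone keeps a dedicated LOCAL_OUT wrapper (ipt_mangle_out) while filter and raw get by with a bare header-sanity check: a mangle rule may rewrite exactly the fields the stack already used to route a locally generated packet, so the wrapper snapshots them before the table runs and re-routes on any change. The tail of the function, continuing the hunk above (lightly condensed from the surrounding tree; iph, saddr, daddr, tos and mark are the locals declared earlier in the function):

        /* Reroute for ANY change. */
        if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
                iph = ip_hdr(skb);

                if (iph->saddr != saddr ||
                    iph->daddr != daddr ||
                    skb->mark != mark ||
                    iph->tos != tos)
                        if (ip_route_me_harder(skb, RTN_UNSPEC))
                                ret = NF_DROP;
        }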
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 993edc23be09..07fb710cd722 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -5,94 +5,49 @@
5 */ 5 */
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/netfilter_ipv4/ip_tables.h> 7#include <linux/netfilter_ipv4/ip_tables.h>
8#include <linux/slab.h>
8#include <net/ip.h> 9#include <net/ip.h>
9 10
10#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) 11#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
11 12
12static const struct
13{
14 struct ipt_replace repl;
15 struct ipt_standard entries[2];
16 struct ipt_error term;
17} initial_table __net_initdata = {
18 .repl = {
19 .name = "raw",
20 .valid_hooks = RAW_VALID_HOOKS,
21 .num_entries = 3,
22 .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
23 .hook_entry = {
24 [NF_INET_PRE_ROUTING] = 0,
25 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
26 },
27 .underflow = {
28 [NF_INET_PRE_ROUTING] = 0,
29 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
30 },
31 },
32 .entries = {
33 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
34 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
35 },
36 .term = IPT_ERROR_INIT, /* ERROR */
37};
38
39static const struct xt_table packet_raw = { 13static const struct xt_table packet_raw = {
40 .name = "raw", 14 .name = "raw",
41 .valid_hooks = RAW_VALID_HOOKS, 15 .valid_hooks = RAW_VALID_HOOKS,
42 .me = THIS_MODULE, 16 .me = THIS_MODULE,
43 .af = NFPROTO_IPV4, 17 .af = NFPROTO_IPV4,
18 .priority = NF_IP_PRI_RAW,
44}; 19};
45 20
46/* The work comes in here from netfilter.c. */ 21/* The work comes in here from netfilter.c. */
47static unsigned int 22static unsigned int
48ipt_hook(unsigned int hook, 23iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
49 struct sk_buff *skb, 24 const struct net_device *in, const struct net_device *out,
50 const struct net_device *in, 25 int (*okfn)(struct sk_buff *))
51 const struct net_device *out,
52 int (*okfn)(struct sk_buff *))
53{ 26{
54 return ipt_do_table(skb, hook, in, out, 27 const struct net *net;
55 dev_net(in)->ipv4.iptable_raw);
56}
57 28
58static unsigned int 29 if (hook == NF_INET_LOCAL_OUT &&
59ipt_local_hook(unsigned int hook, 30 (skb->len < sizeof(struct iphdr) ||
60 struct sk_buff *skb, 31 ip_hdrlen(skb) < sizeof(struct iphdr)))
61 const struct net_device *in, 32 /* root is playing with raw sockets. */
62 const struct net_device *out,
63 int (*okfn)(struct sk_buff *))
64{
65 /* root is playing with raw sockets. */
66 if (skb->len < sizeof(struct iphdr) ||
67 ip_hdrlen(skb) < sizeof(struct iphdr))
68 return NF_ACCEPT; 33 return NF_ACCEPT;
69 return ipt_do_table(skb, hook, in, out, 34
70 dev_net(out)->ipv4.iptable_raw); 35 net = dev_net((in != NULL) ? in : out);
36 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
71} 37}
72 38
73/* 'raw' is the very first table. */ 39static struct nf_hook_ops *rawtable_ops __read_mostly;
74static struct nf_hook_ops ipt_ops[] __read_mostly = {
75 {
76 .hook = ipt_hook,
77 .pf = NFPROTO_IPV4,
78 .hooknum = NF_INET_PRE_ROUTING,
79 .priority = NF_IP_PRI_RAW,
80 .owner = THIS_MODULE,
81 },
82 {
83 .hook = ipt_local_hook,
84 .pf = NFPROTO_IPV4,
85 .hooknum = NF_INET_LOCAL_OUT,
86 .priority = NF_IP_PRI_RAW,
87 .owner = THIS_MODULE,
88 },
89};
90 40
91static int __net_init iptable_raw_net_init(struct net *net) 41static int __net_init iptable_raw_net_init(struct net *net)
92{ 42{
93 /* Register table */ 43 struct ipt_replace *repl;
44
45 repl = ipt_alloc_initial_table(&packet_raw);
46 if (repl == NULL)
47 return -ENOMEM;
94 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
95 ipt_register_table(net, &packet_raw, &initial_table.repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl);
96 if (IS_ERR(net->ipv4.iptable_raw)) 51 if (IS_ERR(net->ipv4.iptable_raw))
97 return PTR_ERR(net->ipv4.iptable_raw); 52 return PTR_ERR(net->ipv4.iptable_raw);
98 return 0; 53 return 0;
@@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
100 55
101static void __net_exit iptable_raw_net_exit(struct net *net) 56static void __net_exit iptable_raw_net_exit(struct net *net)
102{ 57{
103 ipt_unregister_table(net->ipv4.iptable_raw); 58 ipt_unregister_table(net, net->ipv4.iptable_raw);
104} 59}
105 60
106static struct pernet_operations iptable_raw_net_ops = { 61static struct pernet_operations iptable_raw_net_ops = {
@@ -117,9 +72,11 @@ static int __init iptable_raw_init(void)
117 return ret; 72 return ret;
118 73
119 /* Register hooks */ 74 /* Register hooks */
120 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
121 if (ret < 0) 76 if (IS_ERR(rawtable_ops)) {
77 ret = PTR_ERR(rawtable_ops);
122 goto cleanup_table; 78 goto cleanup_table;
79 }
123 80
124 return ret; 81 return ret;
125 82
@@ -130,7 +87,7 @@ static int __init iptable_raw_init(void)
130 87
131static void __exit iptable_raw_fini(void) 88static void __exit iptable_raw_fini(void)
132{ 89{
133 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 90 xt_hook_unlink(&packet_raw, rawtable_ops);
134 unregister_pernet_subsys(&iptable_raw_net_ops); 91 unregister_pernet_subsys(&iptable_raw_net_ops);
135} 92}
136 93
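Each of the static initial_table blobs deleted in this series encoded the same thing by hand: one standard ACCEPT entry per valid hook, a terminating error entry, and matching hook_entry/underflow offsets. ipt_alloc_initial_table() now generates that template at runtime from the xt_table's valid_hooks mask, which is why every converted net-init has the same shape; the immediate kfree() is safe because ipt_register_table() copies the ruleset into its own xt_table_info. The recurring caller pattern, using the raw table above as the concrete case:

        struct ipt_replace *repl;

        repl = ipt_alloc_initial_table(&packet_raw);
        if (repl == NULL)
                return -ENOMEM;
        /* the table core keeps its own copy of the entries... */
        net->ipv4.iptable_raw = ipt_register_table(net, &packet_raw, repl);
        /* ...so the generated template can be freed right away */
        kfree(repl);
        if (IS_ERR(net->ipv4.iptable_raw))
                return PTR_ERR(net->ipv4.iptable_raw);
        return 0;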
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 99eb76c65d25..be45bdc4c602 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/netfilter_ipv4/ip_tables.h> 19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/slab.h>
20#include <net/ip.h> 21#include <net/ip.h>
21 22
22MODULE_LICENSE("GPL"); 23MODULE_LICENSE("GPL");
@@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
27 (1 << NF_INET_FORWARD) | \ 28 (1 << NF_INET_FORWARD) | \
28 (1 << NF_INET_LOCAL_OUT) 29 (1 << NF_INET_LOCAL_OUT)
29 30
30static const struct
31{
32 struct ipt_replace repl;
33 struct ipt_standard entries[3];
34 struct ipt_error term;
35} initial_table __net_initdata = {
36 .repl = {
37 .name = "security",
38 .valid_hooks = SECURITY_VALID_HOOKS,
39 .num_entries = 4,
40 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
41 .hook_entry = {
42 [NF_INET_LOCAL_IN] = 0,
43 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
44 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
45 },
46 .underflow = {
47 [NF_INET_LOCAL_IN] = 0,
48 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
49 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
50 },
51 },
52 .entries = {
53 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
54 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
55 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
56 },
57 .term = IPT_ERROR_INIT, /* ERROR */
58};
59
60static const struct xt_table security_table = { 31static const struct xt_table security_table = {
61 .name = "security", 32 .name = "security",
62 .valid_hooks = SECURITY_VALID_HOOKS, 33 .valid_hooks = SECURITY_VALID_HOOKS,
63 .me = THIS_MODULE, 34 .me = THIS_MODULE,
64 .af = NFPROTO_IPV4, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_SECURITY,
65}; 37};
66 38
67static unsigned int 39static unsigned int
68ipt_local_in_hook(unsigned int hook, 40iptable_security_hook(unsigned int hook, struct sk_buff *skb,
69 struct sk_buff *skb, 41 const struct net_device *in,
70 const struct net_device *in, 42 const struct net_device *out,
71 const struct net_device *out, 43 int (*okfn)(struct sk_buff *))
72 int (*okfn)(struct sk_buff *))
73{
74 return ipt_do_table(skb, hook, in, out,
75 dev_net(in)->ipv4.iptable_security);
76}
77
78static unsigned int
79ipt_forward_hook(unsigned int hook,
80 struct sk_buff *skb,
81 const struct net_device *in,
82 const struct net_device *out,
83 int (*okfn)(struct sk_buff *))
84{ 44{
85 return ipt_do_table(skb, hook, in, out, 45 const struct net *net;
86 dev_net(in)->ipv4.iptable_security);
87}
88 46
89static unsigned int 47 if (hook == NF_INET_LOCAL_OUT &&
90ipt_local_out_hook(unsigned int hook, 48 (skb->len < sizeof(struct iphdr) ||
91 struct sk_buff *skb, 49 ip_hdrlen(skb) < sizeof(struct iphdr)))
92 const struct net_device *in, 50 /* Somebody is playing with raw sockets. */
93 const struct net_device *out,
94 int (*okfn)(struct sk_buff *))
95{
96 /* Somebody is playing with raw sockets. */
97 if (skb->len < sizeof(struct iphdr)
98 || ip_hdrlen(skb) < sizeof(struct iphdr))
99 return NF_ACCEPT; 51 return NF_ACCEPT;
100 return ipt_do_table(skb, hook, in, out, 52
101 dev_net(out)->ipv4.iptable_security); 53 net = dev_net((in != NULL) ? in : out);
54 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
102} 55}
103 56
104static struct nf_hook_ops ipt_ops[] __read_mostly = { 57static struct nf_hook_ops *sectbl_ops __read_mostly;
105 {
106 .hook = ipt_local_in_hook,
107 .owner = THIS_MODULE,
108 .pf = NFPROTO_IPV4,
109 .hooknum = NF_INET_LOCAL_IN,
110 .priority = NF_IP_PRI_SECURITY,
111 },
112 {
113 .hook = ipt_forward_hook,
114 .owner = THIS_MODULE,
115 .pf = NFPROTO_IPV4,
116 .hooknum = NF_INET_FORWARD,
117 .priority = NF_IP_PRI_SECURITY,
118 },
119 {
120 .hook = ipt_local_out_hook,
121 .owner = THIS_MODULE,
122 .pf = NFPROTO_IPV4,
123 .hooknum = NF_INET_LOCAL_OUT,
124 .priority = NF_IP_PRI_SECURITY,
125 },
126};
127 58
128static int __net_init iptable_security_net_init(struct net *net) 59static int __net_init iptable_security_net_init(struct net *net)
129{ 60{
130 net->ipv4.iptable_security = 61 struct ipt_replace *repl;
131 ipt_register_table(net, &security_table, &initial_table.repl);
132 62
63 repl = ipt_alloc_initial_table(&security_table);
64 if (repl == NULL)
65 return -ENOMEM;
66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl);
68 kfree(repl);
133 if (IS_ERR(net->ipv4.iptable_security)) 69 if (IS_ERR(net->ipv4.iptable_security))
134 return PTR_ERR(net->ipv4.iptable_security); 70 return PTR_ERR(net->ipv4.iptable_security);
135 71
@@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net)
138 74
139static void __net_exit iptable_security_net_exit(struct net *net) 75static void __net_exit iptable_security_net_exit(struct net *net)
140{ 76{
141 ipt_unregister_table(net->ipv4.iptable_security); 77 ipt_unregister_table(net, net->ipv4.iptable_security);
142} 78}
143 79
144static struct pernet_operations iptable_security_net_ops = { 80static struct pernet_operations iptable_security_net_ops = {
@@ -154,9 +90,11 @@ static int __init iptable_security_init(void)
154 if (ret < 0) 90 if (ret < 0)
155 return ret; 91 return ret;
156 92
157 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 93 sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
158 if (ret < 0) 94 if (IS_ERR(sectbl_ops)) {
95 ret = PTR_ERR(sectbl_ops);
159 goto cleanup_table; 96 goto cleanup_table;
97 }
160 98
161 return ret; 99 return ret;
162 100
@@ -167,7 +105,7 @@ cleanup_table:
167 105
168static void __exit iptable_security_fini(void) 106static void __exit iptable_security_fini(void)
169{ 107{
170 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 108 xt_hook_unlink(&security_table, sectbl_ops);
171 unregister_pernet_subsys(&iptable_security_net_ops); 109 unregister_pernet_subsys(&iptable_security_net_ops);
172} 110}
173 111
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index aa95bb82ee6c..2bb1f87051c4 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -22,6 +22,7 @@
22#include <net/netfilter/nf_conntrack_helper.h> 22#include <net/netfilter/nf_conntrack_helper.h>
23#include <net/netfilter/nf_conntrack_l4proto.h> 23#include <net/netfilter/nf_conntrack_l4proto.h>
24#include <net/netfilter/nf_conntrack_l3proto.h> 24#include <net/netfilter/nf_conntrack_l3proto.h>
25#include <net/netfilter/nf_conntrack_zones.h>
25#include <net/netfilter/nf_conntrack_core.h> 26#include <net/netfilter/nf_conntrack_core.h>
26#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 27#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
27#include <net/netfilter/nf_nat_helper.h> 28#include <net/netfilter/nf_nat_helper.h>
@@ -195,7 +196,6 @@ static int log_invalid_proto_max = 255;
195 196
196static ctl_table ip_ct_sysctl_table[] = { 197static ctl_table ip_ct_sysctl_table[] = {
197 { 198 {
198 .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
199 .procname = "ip_conntrack_max", 199 .procname = "ip_conntrack_max",
200 .data = &nf_conntrack_max, 200 .data = &nf_conntrack_max,
201 .maxlen = sizeof(int), 201 .maxlen = sizeof(int),
@@ -203,7 +203,6 @@ static ctl_table ip_ct_sysctl_table[] = {
203 .proc_handler = proc_dointvec, 203 .proc_handler = proc_dointvec,
204 }, 204 },
205 { 205 {
206 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
207 .procname = "ip_conntrack_count", 206 .procname = "ip_conntrack_count",
208 .data = &init_net.ct.count, 207 .data = &init_net.ct.count,
209 .maxlen = sizeof(int), 208 .maxlen = sizeof(int),
@@ -211,15 +210,13 @@ static ctl_table ip_ct_sysctl_table[] = {
211 .proc_handler = proc_dointvec, 210 .proc_handler = proc_dointvec,
212 }, 211 },
213 { 212 {
214 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
215 .procname = "ip_conntrack_buckets", 213 .procname = "ip_conntrack_buckets",
216 .data = &nf_conntrack_htable_size, 214 .data = &init_net.ct.htable_size,
217 .maxlen = sizeof(unsigned int), 215 .maxlen = sizeof(unsigned int),
218 .mode = 0444, 216 .mode = 0444,
219 .proc_handler = proc_dointvec, 217 .proc_handler = proc_dointvec,
220 }, 218 },
221 { 219 {
222 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
223 .procname = "ip_conntrack_checksum", 220 .procname = "ip_conntrack_checksum",
224 .data = &init_net.ct.sysctl_checksum, 221 .data = &init_net.ct.sysctl_checksum,
225 .maxlen = sizeof(int), 222 .maxlen = sizeof(int),
@@ -227,19 +224,15 @@ static ctl_table ip_ct_sysctl_table[] = {
227 .proc_handler = proc_dointvec, 224 .proc_handler = proc_dointvec,
228 }, 225 },
229 { 226 {
230 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
231 .procname = "ip_conntrack_log_invalid", 227 .procname = "ip_conntrack_log_invalid",
232 .data = &init_net.ct.sysctl_log_invalid, 228 .data = &init_net.ct.sysctl_log_invalid,
233 .maxlen = sizeof(unsigned int), 229 .maxlen = sizeof(unsigned int),
234 .mode = 0644, 230 .mode = 0644,
235 .proc_handler = proc_dointvec_minmax, 231 .proc_handler = proc_dointvec_minmax,
236 .strategy = sysctl_intvec,
237 .extra1 = &log_invalid_proto_min, 232 .extra1 = &log_invalid_proto_min,
238 .extra2 = &log_invalid_proto_max, 233 .extra2 = &log_invalid_proto_max,
239 }, 234 },
240 { 235 { }
241 .ctl_name = 0
242 }
243}; 236};
244#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ 237#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
245 238
@@ -255,10 +248,10 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
255 struct nf_conntrack_tuple tuple; 248 struct nf_conntrack_tuple tuple;
256 249
257 memset(&tuple, 0, sizeof(tuple)); 250 memset(&tuple, 0, sizeof(tuple));
258 tuple.src.u3.ip = inet->rcv_saddr; 251 tuple.src.u3.ip = inet->inet_rcv_saddr;
259 tuple.src.u.tcp.port = inet->sport; 252 tuple.src.u.tcp.port = inet->inet_sport;
260 tuple.dst.u3.ip = inet->daddr; 253 tuple.dst.u3.ip = inet->inet_daddr;
261 tuple.dst.u.tcp.port = inet->dport; 254 tuple.dst.u.tcp.port = inet->inet_dport;
262 tuple.src.l3num = PF_INET; 255 tuple.src.l3num = PF_INET;
263 tuple.dst.protonum = sk->sk_protocol; 256 tuple.dst.protonum = sk->sk_protocol;
264 257
@@ -274,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
274 return -EINVAL; 267 return -EINVAL;
275 } 268 }
276 269
277 h = nf_conntrack_find_get(sock_net(sk), &tuple); 270 h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
278 if (h) { 271 if (h) {
279 struct sockaddr_in sin; 272 struct sockaddr_in sin;
280 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 273 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
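The .ctl_name and .strategy deletions in the table above (and in the icmp tables below) track the tree-wide removal of binary sysctls from this period: entries are identified by .procname alone, and each table ends with an empty sentinel. The resulting entry shape, for reference (hypothetical example_value name):

        static ctl_table example_sysctl_table[] = {
                {
                        .procname     = "example_value",   /* hypothetical */
                        .data         = &example_value,
                        .maxlen       = sizeof(int),
                        .mode         = 0644,
                        .proc_handler = proc_dointvec,
                },
                { }     /* sentinel */
        };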
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 8668a3defda6..2fb7b76da94f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
32 struct hlist_nulls_node *n; 32 struct hlist_nulls_node *n;
33 33
34 for (st->bucket = 0; 34 for (st->bucket = 0;
35 st->bucket < nf_conntrack_htable_size; 35 st->bucket < net->ct.htable_size;
36 st->bucket++) { 36 st->bucket++) {
37 n = rcu_dereference(net->ct.hash[st->bucket].first); 37 n = rcu_dereference(net->ct.hash[st->bucket].first);
38 if (!is_a_nulls(n)) 38 if (!is_a_nulls(n))
@@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
50 head = rcu_dereference(head->next); 50 head = rcu_dereference(head->next);
51 while (is_a_nulls(head)) { 51 while (is_a_nulls(head)) {
52 if (likely(get_nulls_value(head) == st->bucket)) { 52 if (likely(get_nulls_value(head) == st->bucket)) {
53 if (++st->bucket >= nf_conntrack_htable_size) 53 if (++st->bucket >= net->ct.htable_size)
54 return NULL; 54 return NULL;
55 } 55 }
56 head = rcu_dereference(net->ct.hash[st->bucket].first); 56 head = rcu_dereference(net->ct.hash[st->bucket].first);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d71ba7677344..7404bde95994 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -18,6 +18,7 @@
18#include <net/netfilter/nf_conntrack_tuple.h> 18#include <net/netfilter/nf_conntrack_tuple.h>
19#include <net/netfilter/nf_conntrack_l4proto.h> 19#include <net/netfilter/nf_conntrack_l4proto.h>
20#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
21#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/nf_log.h> 22#include <net/netfilter/nf_log.h>
22 23
23static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
@@ -54,8 +55,8 @@ static const u_int8_t invmap[] = {
54static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, 55static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
55 const struct nf_conntrack_tuple *orig) 56 const struct nf_conntrack_tuple *orig)
56{ 57{
57 if (orig->dst.u.icmp.type >= sizeof(invmap) 58 if (orig->dst.u.icmp.type >= sizeof(invmap) ||
58 || !invmap[orig->dst.u.icmp.type]) 59 !invmap[orig->dst.u.icmp.type])
59 return false; 60 return false;
60 61
61 tuple->src.u.icmp.id = orig->src.u.icmp.id; 62 tuple->src.u.icmp.id = orig->src.u.icmp.id;
@@ -101,8 +102,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
101 [ICMP_ADDRESS] = 1 102 [ICMP_ADDRESS] = 1
102 }; 103 };
103 104
104 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) 105 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
105 || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { 106 !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
106 /* Can't create a new ICMP `conn' with this. */ 107 /* Can't create a new ICMP `conn' with this. */
107 pr_debug("icmp: can't create new conn with type %u\n", 108 pr_debug("icmp: can't create new conn with type %u\n",
108 ct->tuplehash[0].tuple.dst.u.icmp.type); 109 ct->tuplehash[0].tuple.dst.u.icmp.type);
@@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
114 115
115/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 116/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
116static int 117static int
117icmp_error_message(struct net *net, struct sk_buff *skb, 118icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
118 enum ip_conntrack_info *ctinfo, 119 enum ip_conntrack_info *ctinfo,
119 unsigned int hooknum) 120 unsigned int hooknum)
120{ 121{
121 struct nf_conntrack_tuple innertuple, origtuple; 122 struct nf_conntrack_tuple innertuple, origtuple;
122 const struct nf_conntrack_l4proto *innerproto; 123 const struct nf_conntrack_l4proto *innerproto;
123 const struct nf_conntrack_tuple_hash *h; 124 const struct nf_conntrack_tuple_hash *h;
125 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
124 126
125 NF_CT_ASSERT(skb->nfct == NULL); 127 NF_CT_ASSERT(skb->nfct == NULL);
126 128
@@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
146 148
147 *ctinfo = IP_CT_RELATED; 149 *ctinfo = IP_CT_RELATED;
148 150
149 h = nf_conntrack_find_get(net, &innertuple); 151 h = nf_conntrack_find_get(net, zone, &innertuple);
150 if (!h) { 152 if (!h) {
151 pr_debug("icmp_error_message: no match\n"); 153 pr_debug("icmp_error_message: no match\n");
152 return -NF_ACCEPT; 154 return -NF_ACCEPT;
@@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
163 165
164/* Small and modified version of icmp_rcv */ 166/* Small and modified version of icmp_rcv */
165static int 167static int
166icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, 168icmp_error(struct net *net, struct nf_conn *tmpl,
169 struct sk_buff *skb, unsigned int dataoff,
167 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 170 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
168{ 171{
169 const struct icmphdr *icmph; 172 const struct icmphdr *icmph;
@@ -201,14 +204,14 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
201 } 204 }
202 205
203 /* Need to track icmp error message? */ 206 /* Need to track icmp error message? */
204 if (icmph->type != ICMP_DEST_UNREACH 207 if (icmph->type != ICMP_DEST_UNREACH &&
205 && icmph->type != ICMP_SOURCE_QUENCH 208 icmph->type != ICMP_SOURCE_QUENCH &&
206 && icmph->type != ICMP_TIME_EXCEEDED 209 icmph->type != ICMP_TIME_EXCEEDED &&
207 && icmph->type != ICMP_PARAMETERPROB 210 icmph->type != ICMP_PARAMETERPROB &&
208 && icmph->type != ICMP_REDIRECT) 211 icmph->type != ICMP_REDIRECT)
209 return NF_ACCEPT; 212 return NF_ACCEPT;
210 213
211 return icmp_error_message(net, skb, ctinfo, hooknum); 214 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
212} 215}
213 216
214#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 217#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
@@ -238,17 +241,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
238static int icmp_nlattr_to_tuple(struct nlattr *tb[], 241static int icmp_nlattr_to_tuple(struct nlattr *tb[],
239 struct nf_conntrack_tuple *tuple) 242 struct nf_conntrack_tuple *tuple)
240{ 243{
241 if (!tb[CTA_PROTO_ICMP_TYPE] 244 if (!tb[CTA_PROTO_ICMP_TYPE] ||
242 || !tb[CTA_PROTO_ICMP_CODE] 245 !tb[CTA_PROTO_ICMP_CODE] ||
243 || !tb[CTA_PROTO_ICMP_ID]) 246 !tb[CTA_PROTO_ICMP_ID])
244 return -EINVAL; 247 return -EINVAL;
245 248
246 tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); 249 tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
247 tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); 250 tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
248 tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); 251 tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
249 252
250 if (tuple->dst.u.icmp.type >= sizeof(invmap) 253 if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
251 || !invmap[tuple->dst.u.icmp.type]) 254 !invmap[tuple->dst.u.icmp.type])
252 return -EINVAL; 255 return -EINVAL;
253 256
254 return 0; 257 return 0;
@@ -270,9 +273,7 @@ static struct ctl_table icmp_sysctl_table[] = {
270 .mode = 0644, 273 .mode = 0644,
271 .proc_handler = proc_dointvec_jiffies, 274 .proc_handler = proc_dointvec_jiffies,
272 }, 275 },
273 { 276 { }
274 .ctl_name = 0
275 }
276}; 277};
277#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT 278#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
278static struct ctl_table icmp_compat_sysctl_table[] = { 279static struct ctl_table icmp_compat_sysctl_table[] = {
@@ -283,9 +284,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
283 .mode = 0644, 284 .mode = 0644,
284 .proc_handler = proc_dointvec_jiffies, 285 .proc_handler = proc_dointvec_jiffies,
285 }, 286 },
286 { 287 { }
287 .ctl_name = 0
288 }
289}; 288};
290#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 289#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
291#endif /* CONFIG_SYSCTL */ 290#endif /* CONFIG_SYSCTL */
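The signature changes to icmp_error()/icmp_error_message() thread a template conntrack (tmpl) down from the core so that an ICMP error can be matched against its inner connection in the correct conntrack zone; with no template attached, lookups fall back to the default zone. As the hunks above show, the zone selection reduces to:

        /* zone used for the inner-tuple lookup */
        u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;

        h = nf_conntrack_find_get(net, zone, &innertuple);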
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index fa2d6b6fc3e5..cb763ae9ed90 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -14,8 +14,13 @@
14#include <net/route.h> 14#include <net/route.h>
15#include <net/ip.h> 15#include <net/ip.h>
16 16
17#include <linux/netfilter_bridge.h>
17#include <linux/netfilter_ipv4.h> 18#include <linux/netfilter_ipv4.h>
18#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 19#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
20#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
21#include <net/netfilter/nf_conntrack.h>
22#endif
23#include <net/netfilter/nf_conntrack_zones.h>
19 24
20/* Returns new sk_buff, or NULL */ 25/* Returns new sk_buff, or NULL */
21static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) 26static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -34,6 +39,27 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
34 return err; 39 return err;
35} 40}
36 41
42static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
43 struct sk_buff *skb)
44{
45 u16 zone = NF_CT_DEFAULT_ZONE;
46
47#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
48 if (skb->nfct)
49 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
50#endif
51
52#ifdef CONFIG_BRIDGE_NETFILTER
53 if (skb->nf_bridge &&
54 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
55 return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
56#endif
57 if (hooknum == NF_INET_PRE_ROUTING)
58 return IP_DEFRAG_CONNTRACK_IN + zone;
59 else
60 return IP_DEFRAG_CONNTRACK_OUT + zone;
61}
62
37static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, 63static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
38 struct sk_buff *skb, 64 struct sk_buff *skb,
39 const struct net_device *in, 65 const struct net_device *in,
@@ -44,16 +70,14 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
44#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) 70#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
45 /* Previously seen (loopback)? Ignore. Do this before 71 /* Previously seen (loopback)? Ignore. Do this before
46 fragment check. */ 72 fragment check. */
47 if (skb->nfct) 73 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
48 return NF_ACCEPT; 74 return NF_ACCEPT;
49#endif 75#endif
50#endif 76#endif
51 /* Gather fragments. */ 77 /* Gather fragments. */
52 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 78 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
53 if (nf_ct_ipv4_gather_frags(skb, 79 enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
54 hooknum == NF_INET_PRE_ROUTING ? 80 if (nf_ct_ipv4_gather_frags(skb, user))
55 IP_DEFRAG_CONNTRACK_IN :
56 IP_DEFRAG_CONNTRACK_OUT))
57 return NF_STOLEN; 81 return NF_STOLEN;
58 } 82 }
59 return NF_ACCEPT; 83 return NF_ACCEPT;
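nf_ct_defrag_user() works because the ip_defrag_users enum reserves a whole range per conntrack user, indexed by zone: fragments queued by different zones (or by the bridge PRE_ROUTING path) get distinct defrag keys and are never reassembled together. The enum layout this relies on looks roughly like the following (reconstructed sketch of include/net/ip.h from the same series; exact spelling may differ, and USHORT_MAX was the limit macro of the day):

        enum ip_defrag_users {
                IP_DEFRAG_LOCAL_DELIVER,
                IP_DEFRAG_CALL_RA_CHAIN,
                IP_DEFRAG_CONNTRACK_IN,
                __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHORT_MAX,
                IP_DEFRAG_CONNTRACK_OUT,
                __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHORT_MAX,
                IP_DEFRAG_CONNTRACK_BRIDGE_IN,
                __IP_DEFRAG_CONNTRACK_BRIDGE_IN_END = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHORT_MAX,
                IP_DEFRAG_VS_IN,
                IP_DEFRAG_VS_OUT,
                IP_DEFRAG_VS_FWD
        };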
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index fe1a64479dd0..4f8bddb760c9 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -12,6 +12,7 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/timer.h> 13#include <linux/timer.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/gfp.h>
15#include <net/checksum.h> 16#include <net/checksum.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -30,14 +31,12 @@
30#include <net/netfilter/nf_conntrack_helper.h> 31#include <net/netfilter/nf_conntrack_helper.h>
31#include <net/netfilter/nf_conntrack_l3proto.h> 32#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_l4proto.h> 33#include <net/netfilter/nf_conntrack_l4proto.h>
34#include <net/netfilter/nf_conntrack_zones.h>
33 35
34static DEFINE_SPINLOCK(nf_nat_lock); 36static DEFINE_SPINLOCK(nf_nat_lock);
35 37
36static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
37 39
38/* Calculated at init based on memory size */
39static unsigned int nf_nat_htable_size __read_mostly;
40
41#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
42static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
43 __read_mostly; 42 __read_mostly;
@@ -72,15 +71,16 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);
72 71
73/* We keep an extra hash for each conntrack, for fast searching. */ 72/* We keep an extra hash for each conntrack, for fast searching. */
74static inline unsigned int 73static inline unsigned int
75hash_by_src(const struct nf_conntrack_tuple *tuple) 74hash_by_src(const struct net *net, u16 zone,
75 const struct nf_conntrack_tuple *tuple)
76{ 76{
77 unsigned int hash; 77 unsigned int hash;
78 78
79 /* Original src, to ensure we map it consistently if poss. */ 79 /* Original src, to ensure we map it consistently if poss. */
80 hash = jhash_3words((__force u32)tuple->src.u3.ip, 80 hash = jhash_3words((__force u32)tuple->src.u3.ip,
81 (__force u32)tuple->src.u.all, 81 (__force u32)tuple->src.u.all ^ zone,
82 tuple->dst.protonum, 0); 82 tuple->dst.protonum, 0);
83 return ((u64)hash * nf_nat_htable_size) >> 32; 83 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
84} 84}
85 85
86/* Is this tuple already taken? (not by us) */ 86/* Is this tuple already taken? (not by us) */
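Two details in hash_by_src() are worth spelling out: the zone is folded into one of the jhash inputs, so identical tuples in different zones land on different chains, and the final ((u64)hash * size) >> 32 maps the full 32-bit hash onto [0, size) with a multiply instead of a modulo. The reduction step in isolation, as a self-contained userspace C demo with arbitrary example hashes:

        #include <stdint.h>
        #include <stdio.h>

        /* multiply-shift range reduction: a hash uniform over 2^32
         * becomes a bucket uniform over [0, size), no division needed */
        static uint32_t bucket(uint32_t hash, uint32_t size)
        {
                return ((uint64_t)hash * size) >> 32;
        }

        int main(void)
        {
                printf("%u\n", bucket(0x9e3779b9u, 16384)); /* prints 10125 */
                printf("%u\n", bucket(0xdeadbeefu, 16384)); /* prints 14251 */
                return 0;
        }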
@@ -142,12 +142,12 @@ same_src(const struct nf_conn *ct,
142 142
143/* Only called for SRC manip */ 143/* Only called for SRC manip */
144static int 144static int
145find_appropriate_src(struct net *net, 145find_appropriate_src(struct net *net, u16 zone,
146 const struct nf_conntrack_tuple *tuple, 146 const struct nf_conntrack_tuple *tuple,
147 struct nf_conntrack_tuple *result, 147 struct nf_conntrack_tuple *result,
148 const struct nf_nat_range *range) 148 const struct nf_nat_range *range)
149{ 149{
150 unsigned int h = hash_by_src(tuple); 150 unsigned int h = hash_by_src(net, zone, tuple);
151 const struct nf_conn_nat *nat; 151 const struct nf_conn_nat *nat;
152 const struct nf_conn *ct; 152 const struct nf_conn *ct;
153 const struct hlist_node *n; 153 const struct hlist_node *n;
@@ -155,7 +155,7 @@ find_appropriate_src(struct net *net,
155 rcu_read_lock(); 155 rcu_read_lock();
156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { 156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
157 ct = nat->ct; 157 ct = nat->ct;
158 if (same_src(ct, tuple)) { 158 if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
159 /* Copy source part from reply tuple. */ 159 /* Copy source part from reply tuple. */
160 nf_ct_invert_tuplepr(result, 160 nf_ct_invert_tuplepr(result,
161 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 161 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -178,7 +178,7 @@ find_appropriate_src(struct net *net,
178 the ip with the lowest src-ip/dst-ip/proto usage. 178 the ip with the lowest src-ip/dst-ip/proto usage.
179*/ 179*/
180static void 180static void
181find_best_ips_proto(struct nf_conntrack_tuple *tuple, 181find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
182 const struct nf_nat_range *range, 182 const struct nf_nat_range *range,
183 const struct nf_conn *ct, 183 const struct nf_conn *ct,
184 enum nf_nat_manip_type maniptype) 184 enum nf_nat_manip_type maniptype)
@@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
212 maxip = ntohl(range->max_ip); 212 maxip = ntohl(range->max_ip);
213 j = jhash_2words((__force u32)tuple->src.u3.ip, 213 j = jhash_2words((__force u32)tuple->src.u3.ip,
214 range->flags & IP_NAT_RANGE_PERSISTENT ? 214 range->flags & IP_NAT_RANGE_PERSISTENT ?
215 0 : (__force u32)tuple->dst.u3.ip, 0); 215 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
216 j = ((u64)j * (maxip - minip + 1)) >> 32; 216 j = ((u64)j * (maxip - minip + 1)) >> 32;
217 *var_ipp = htonl(minip + j); 217 *var_ipp = htonl(minip + j);
218} 218}
@@ -232,6 +232,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
232{ 232{
233 struct net *net = nf_ct_net(ct); 233 struct net *net = nf_ct_net(ct);
234 const struct nf_nat_protocol *proto; 234 const struct nf_nat_protocol *proto;
235 u16 zone = nf_ct_zone(ct);
235 236
236 /* 1) If this srcip/proto/src-proto-part is currently mapped, 237 /* 1) If this srcip/proto/src-proto-part is currently mapped,
237 and that same mapping gives a unique tuple within the given 238 and that same mapping gives a unique tuple within the given
@@ -242,7 +243,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
242 manips not an issue. */ 243 manips not an issue. */
243 if (maniptype == IP_NAT_MANIP_SRC && 244 if (maniptype == IP_NAT_MANIP_SRC &&
244 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 245 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
245 if (find_appropriate_src(net, orig_tuple, tuple, range)) { 246 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
246 pr_debug("get_unique_tuple: Found current src map\n"); 247 pr_debug("get_unique_tuple: Found current src map\n");
247 if (!nf_nat_used_tuple(tuple, ct)) 248 if (!nf_nat_used_tuple(tuple, ct))
248 return; 249 return;
@@ -252,7 +253,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
252 /* 2) Select the least-used IP/proto combination in the given 253 /* 2) Select the least-used IP/proto combination in the given
253 range. */ 254 range. */
254 *tuple = *orig_tuple; 255 *tuple = *orig_tuple;
255 find_best_ips_proto(tuple, range, ct, maniptype); 256 find_best_ips_proto(zone, tuple, range, ct, maniptype);
256 257
257 /* 3) The per-protocol part of the manip is made to map into 258 /* 3) The per-protocol part of the manip is made to map into
258 the range to make a unique tuple. */ 259 the range to make a unique tuple. */
@@ -330,7 +331,8 @@ nf_nat_setup_info(struct nf_conn *ct,
330 if (have_to_hash) { 331 if (have_to_hash) {
331 unsigned int srchash; 332 unsigned int srchash;
332 333
333 srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 334 srchash = hash_by_src(net, nf_ct_zone(ct),
335 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
334 spin_lock_bh(&nf_nat_lock); 336 spin_lock_bh(&nf_nat_lock);
335 /* nf_conntrack_alter_reply might re-allocate exntension aera */ 337 /* nf_conntrack_alter_reply might re-allocate exntension aera */
336 nat = nfct_nat(ct); 338 nat = nfct_nat(ct);
@@ -679,8 +681,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
679 681
680static int __net_init nf_nat_net_init(struct net *net) 682static int __net_init nf_nat_net_init(struct net *net)
681{ 683{
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 684 /* Leave them the same for the moment. */
683 &net->ipv4.nat_vmalloced, 0); 685 net->ipv4.nat_htable_size = net->ct.htable_size;
686 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
687 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 688 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 689 return -ENOMEM;
686 return 0; 690 return 0;
@@ -703,7 +707,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 707 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 708 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 709 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
706 nf_nat_htable_size); 710 net->ipv4.nat_htable_size);
707} 711}
708 712
709static struct pernet_operations nf_nat_net_ops = { 713static struct pernet_operations nf_nat_net_ops = {
@@ -724,9 +728,6 @@ static int __init nf_nat_init(void)
724 return ret; 728 return ret;
725 } 729 }
726 730
727 /* Leave them the same for the moment. */
728 nf_nat_htable_size = nf_conntrack_htable_size;
729
730 ret = register_pernet_subsys(&nf_nat_net_ops); 731 ret = register_pernet_subsys(&nf_nat_net_ops);
731 if (ret < 0) 732 if (ret < 0)
732 goto cleanup_extend; 733 goto cleanup_extend;
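The nf_nat_core.c hunks above do two related things: the NAT hash table size moves from a module-wide global into per-namespace state (net->ipv4.nat_htable_size), and the conntrack zone is XORed into both the by-source hash and the address-selection hash, so identical tuples in different zones hash independently. The following user-space sketch shows the bucket-selection scheme; mix() is only an illustrative stand-in for the kernel's jhash_3words(), and the addresses and table size are invented for the demo.

	/* build: cc -o hash hash.c */
	#include <stdint.h>
	#include <stdio.h>

	/* illustrative substitute for jhash_3words(); not the kernel's hash */
	static uint32_t mix(uint32_t a, uint32_t b, uint32_t c)
	{
		a ^= b * 0x9e3779b9u;
		a = (a << 13) | (a >> 19);
		a ^= c * 0x85ebca6bu;
		return a * 0xc2b2ae35u;
	}

	static unsigned int hash_by_src(uint32_t src_ip, uint16_t src_port,
					uint8_t proto, uint16_t zone,
					unsigned int htable_size)
	{
		/* the zone is folded into the port word, as in the patch */
		uint32_t hash = mix(src_ip, (uint32_t)src_port ^ zone, proto);

		/* multiply-shift maps the hash onto [0, htable_size)
		 * without a modulo, like the patched hash_by_src() */
		return ((uint64_t)hash * htable_size) >> 32;
	}

	int main(void)
	{
		/* same tuple in two zones: usually different buckets */
		printf("zone 0: bucket %u\n",
		       hash_by_src(0xc0a80101, 1024, 6, 0, 16384));
		printf("zone 1: bucket %u\n",
		       hash_by_src(0xc0a80101, 1024, 6, 1, 16384));
		return 0;
	}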
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index a1d5d58a58bf..86e0e84ff0a0 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp");
 
 /* FIXME: Time out? --RR */
 
-static int
-mangle_rfc959_packet(struct sk_buff *skb,
-		     __be32 newip,
-		     u_int16_t port,
-		     unsigned int matchoff,
-		     unsigned int matchlen,
-		     struct nf_conn *ct,
-		     enum ip_conntrack_info ctinfo)
+static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
+			      char *buffer, size_t buflen,
+			      __be32 addr, u16 port)
 {
-	char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
-
-	sprintf(buffer, "%u,%u,%u,%u,%u,%u",
-		NIPQUAD(newip), port>>8, port&0xFF);
-
-	pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
-	return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
-					matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_eprt_packet(struct sk_buff *skb,
-		   __be32 newip,
-		   u_int16_t port,
-		   unsigned int matchoff,
-		   unsigned int matchlen,
-		   struct nf_conn *ct,
-		   enum ip_conntrack_info ctinfo)
-{
-	char buffer[sizeof("|1|255.255.255.255|65535|")];
-
-	sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
-
-	pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
-	return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
-					matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_epsv_packet(struct sk_buff *skb,
-		   __be32 newip,
-		   u_int16_t port,
-		   unsigned int matchoff,
-		   unsigned int matchlen,
-		   struct nf_conn *ct,
-		   enum ip_conntrack_info ctinfo)
-{
-	char buffer[sizeof("|||65535|")];
-
-	sprintf(buffer, "|||%u|", port);
-
-	pr_debug("calling nf_nat_mangle_tcp_packet\n");
+	switch (type) {
+	case NF_CT_FTP_PORT:
+	case NF_CT_FTP_PASV:
+		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+				((unsigned char *)&addr)[0],
+				((unsigned char *)&addr)[1],
+				((unsigned char *)&addr)[2],
+				((unsigned char *)&addr)[3],
+				port >> 8,
+				port & 0xFF);
+	case NF_CT_FTP_EPRT:
+		return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
+	case NF_CT_FTP_EPSV:
+		return snprintf(buffer, buflen, "|||%u|", port);
+	}
 
-	return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
-					matchlen, buffer, strlen(buffer));
+	return 0;
 }
 
-static int (*mangle[])(struct sk_buff *, __be32, u_int16_t,
-		       unsigned int, unsigned int, struct nf_conn *,
-		       enum ip_conntrack_info)
-= {
-	[NF_CT_FTP_PORT] = mangle_rfc959_packet,
-	[NF_CT_FTP_PASV] = mangle_rfc959_packet,
-	[NF_CT_FTP_EPRT] = mangle_eprt_packet,
-	[NF_CT_FTP_EPSV] = mangle_epsv_packet
-};
-
 /* So, this packet has hit the connection tracking matching code.
    Mangle it, and change the expectation to match the new version. */
 static unsigned int nf_nat_ftp(struct sk_buff *skb,
@@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 	u_int16_t port;
 	int dir = CTINFO2DIR(ctinfo);
 	struct nf_conn *ct = exp->master;
+	char buffer[sizeof("|1|255.255.255.255|65535|")];
+	unsigned int buflen;
 
 	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
 
@@ -132,11 +87,21 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 	if (port == 0)
 		return NF_DROP;
 
-	if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) {
-		nf_ct_unexpect_related(exp);
-		return NF_DROP;
-	}
+	buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
+	if (!buflen)
+		goto out;
+
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
+				      matchlen, buffer, buflen))
+		goto out;
+
 	return NF_ACCEPT;
+
+out:
+	nf_ct_unexpect_related(exp);
+	return NF_DROP;
 }
 
 static void __exit nf_nat_ftp_fini(void)
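The nf_nat_ftp.c rewrite above collapses three nearly identical mangle_*_packet() helpers and a function-pointer table into one snprintf()-based formatter plus a single call to nf_nat_mangle_tcp_packet(). A rough user-space rendering of that formatter follows; the kernel-only %pI4 specifier is replaced by explicit byte formatting, and the enum names are shortened for the demo.

	#include <stdint.h>
	#include <stdio.h>

	enum ftp_type { FTP_PORT, FTP_PASV, FTP_EPRT, FTP_EPSV };

	static int fmt_cmd(enum ftp_type type, char *buf, size_t len,
			   uint32_t addr, uint16_t port)
	{
		const unsigned char *a = (const unsigned char *)&addr;

		switch (type) {
		case FTP_PORT:
		case FTP_PASV:		/* PORT/PASV: comma-separated bytes */
			return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
					a[0], a[1], a[2], a[3],
					port >> 8, port & 0xFF);
		case FTP_EPRT:		/* |1|a.b.c.d|port| */
			return snprintf(buf, len, "|1|%u.%u.%u.%u|%u|",
					a[0], a[1], a[2], a[3], port);
		case FTP_EPSV:		/* |||port| */
			return snprintf(buf, len, "|||%u|", port);
		}
		return 0;		/* unknown type: caller drops the packet */
	}

	int main(void)
	{
		char buf[sizeof("|1|255.255.255.255|65535|")];
		uint32_t addr;
		unsigned char *p = (unsigned char *)&addr;

		p[0] = 132; p[1] = 235; p[2] = 1; p[3] = 2;	/* 132.235.1.2 */
		fmt_cmd(FTP_PASV, buf, sizeof(buf), addr, 6275);
		puts(buf);	/* prints: 132,235,1,2,24,131 */
		return 0;
	}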
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index f9520fa3aba9..4a0c6b548eee 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -8,6 +8,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/module.h>
+#include <linux/gfp.h>
 #include <linux/kmod.h>
 #include <linux/types.h>
 #include <linux/timer.h>
@@ -41,18 +42,14 @@ adjust_tcp_sequence(u32 seq,
 		    struct nf_conn *ct,
 		    enum ip_conntrack_info ctinfo)
 {
-	int dir;
-	struct nf_nat_seq *this_way, *other_way;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_nat_seq *this_way = &nat->seq[dir];
 
-	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq);
-
-	dir = CTINFO2DIR(ctinfo);
-
-	this_way = &nat->seq[dir];
-	other_way = &nat->seq[!dir];
+	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+		 seq, sizediff);
 
-	pr_debug("nf_nat_resize_packet: Seq_offset before: ");
+	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
 	DUMP_OFFSET(this_way);
 
 	spin_lock_bh(&nf_nat_seqofs_lock);
@@ -63,13 +60,13 @@
 	 * retransmit */
 	if (this_way->offset_before == this_way->offset_after ||
 	    before(this_way->correction_pos, seq)) {
 		this_way->correction_pos = seq;
 		this_way->offset_before = this_way->offset_after;
 		this_way->offset_after += sizediff;
 	}
 	spin_unlock_bh(&nf_nat_seqofs_lock);
 
-	pr_debug("nf_nat_resize_packet: Seq_offset after: ");
+	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
 	DUMP_OFFSET(this_way);
 }
 
@@ -145,6 +142,17 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
 	return 1;
 }
 
+void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			   __be32 seq, s16 off)
+{
+	if (!off)
+		return;
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+	adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
+	nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
+}
+EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -153,14 +161,13 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
  * skb enlargement, ...
  *
  * */
-int
-nf_nat_mangle_tcp_packet(struct sk_buff *skb,
-			 struct nf_conn *ct,
-			 enum ip_conntrack_info ctinfo,
-			 unsigned int match_offset,
-			 unsigned int match_len,
-			 const char *rep_buffer,
-			 unsigned int rep_len)
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
+			       struct nf_conn *ct,
+			       enum ip_conntrack_info ctinfo,
+			       unsigned int match_offset,
+			       unsigned int match_len,
+			       const char *rep_buffer,
+			       unsigned int rep_len, bool adjust)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
@@ -206,16 +213,13 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 	inet_proto_csum_replace2(&tcph->check, skb,
 				 htons(oldlen), htons(datalen), 1);
 
-	if (rep_len != match_len) {
-		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-		adjust_tcp_sequence(ntohl(tcph->seq),
-				    (int)rep_len - (int)match_len,
-				    ct, ctinfo);
-		nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
-	}
+	if (adjust && rep_len != match_len)
+		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
+				      (int)rep_len - (int)match_len);
+
 	return 1;
 }
-EXPORT_SYMBOL(nf_nat_mangle_tcp_packet);
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
 
 /* Generic function for mangling variable-length address changes inside
 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
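nf_nat_set_seq_adjust() above factors the "record a TCP sequence offset" step out of nf_nat_mangle_tcp_packet(), so helpers that mangle several messages per segment (SIP over TCP, below) can accumulate one net adjustment and register it once. A user-space sketch of the per-direction bookkeeping it feeds:

	#include <stdint.h>
	#include <stdio.h>

	struct nat_seq {
		uint32_t correction_pos;  /* seq number of last mangled packet */
		int16_t  offset_before;   /* offset for data before that point */
		int16_t  offset_after;    /* offset for data at/after it */
	};

	/* wrap-safe "s1 is earlier than s2", like the kernel's before() */
	static int before(uint32_t s1, uint32_t s2)
	{
		return (int32_t)(s1 - s2) < 0;
	}

	static void adjust(struct nat_seq *ns, uint32_t seq, int sizediff)
	{
		/* only advance the record if this is not a retransmit */
		if (ns->offset_before == ns->offset_after ||
		    before(ns->correction_pos, seq)) {
			ns->correction_pos = seq;
			ns->offset_before = ns->offset_after;
			ns->offset_after += sizediff;
		}
	}

	int main(void)
	{
		struct nat_seq ns = { 0, 0, 0 };

		adjust(&ns, 1000, +4);	/* a rewrite grew the payload by 4 */
		adjust(&ns, 2000, -2);	/* a later rewrite shrank one by 2 */
		printf("pos=%u before=%d after=%d\n",
		       ns.correction_pos, ns.offset_before, ns.offset_after);
		return 0;
	}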
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 9eb171056c63..4c060038d29f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_rule.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
 #include <linux/netfilter/nf_conntrack_pptp.h>
 
@@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 
 	pr_debug("trying to unexpect other dir: ");
 	nf_ct_dump_tuple_ip(&t);
-	other_exp = nf_ct_expect_find_get(net, &t);
+	other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
 	if (other_exp) {
 		nf_ct_unexpect_related(other_exp);
 		nf_ct_expect_put(other_exp);
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 9e81e0dfb4ec..26de2c1f7fab 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -15,6 +15,7 @@
 #include <linux/kmod.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
+#include <linux/slab.h>
 #include <net/checksum.h>
 #include <net/route.h>
 #include <linux/bitops.h>
@@ -28,36 +29,6 @@
 	 (1 << NF_INET_POST_ROUTING) | \
 	 (1 << NF_INET_LOCAL_OUT))
 
-static const struct
-{
-	struct ipt_replace repl;
-	struct ipt_standard entries[3];
-	struct ipt_error term;
-} nat_initial_table __net_initdata = {
-	.repl = {
-		.name = "nat",
-		.valid_hooks = NAT_VALID_HOOKS,
-		.num_entries = 4,
-		.size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
-		.hook_entry = {
-			[NF_INET_PRE_ROUTING] = 0,
-			[NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
-			[NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
-		},
-		.underflow = {
-			[NF_INET_PRE_ROUTING] = 0,
-			[NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
-			[NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
-		},
-	},
-	.entries = {
-		IPT_STANDARD_INIT(NF_ACCEPT),	/* PRE_ROUTING */
-		IPT_STANDARD_INIT(NF_ACCEPT),	/* POST_ROUTING */
-		IPT_STANDARD_INIT(NF_ACCEPT),	/* LOCAL_OUT */
-	},
-	.term = IPT_ERROR_INIT,			/* ERROR */
-};
-
 static const struct xt_table nat_table = {
 	.name		= "nat",
 	.valid_hooks	= NAT_VALID_HOOKS,
@@ -186,8 +157,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
 
 static int __net_init nf_nat_rule_net_init(struct net *net)
 {
-	net->ipv4.nat_table = ipt_register_table(net, &nat_table,
-						 &nat_initial_table.repl);
+	struct ipt_replace *repl;
+
+	repl = ipt_alloc_initial_table(&nat_table);
+	if (repl == NULL)
+		return -ENOMEM;
+	net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
+	kfree(repl);
 	if (IS_ERR(net->ipv4.nat_table))
 		return PTR_ERR(net->ipv4.nat_table);
 	return 0;
@@ -195,7 +171,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net)
 
 static void __net_exit nf_nat_rule_net_exit(struct net *net)
 {
-	ipt_unregister_table(net->ipv4.nat_table);
+	ipt_unregister_table(net, net->ipv4.nat_table);
 }
 
 static struct pernet_operations nf_nat_rule_net_ops = {
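The nf_nat_rule.c change above drops the hand-built static nat_initial_table in favour of ipt_alloc_initial_table(), which constructs the initial ruleset at runtime; registration copies what it needs, so the temporary is freed right after ipt_register_table() returns. The sketch below shows the same build-copy-free ownership pattern in plain user-space C; all names here are illustrative, not the xtables API.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct replace { char name[16]; unsigned int num_entries; };
	struct table   { char name[16]; unsigned int num_entries; };

	static struct replace *alloc_initial_table(const char *name)
	{
		struct replace *repl = calloc(1, sizeof(*repl));

		if (!repl)
			return NULL;
		snprintf(repl->name, sizeof(repl->name), "%s", name);
		repl->num_entries = 4;	/* 3 standard entries + error entry */
		return repl;
	}

	static struct table *register_table(const struct replace *repl)
	{
		struct table *t = calloc(1, sizeof(*t));

		if (!t)
			return NULL;
		memcpy(t->name, repl->name, sizeof(t->name));
		t->num_entries = repl->num_entries; /* deep copy: repl can go */
		return t;
	}

	int main(void)
	{
		struct replace *repl = alloc_initial_table("nat");
		struct table *nat_table;

		if (!repl)
			return 1;
		nat_table = register_table(repl);
		free(repl);		/* safe: register_table() copied it */
		if (!nat_table)
			return 1;
		printf("%s: %u entries\n", nat_table->name, nat_table->num_entries);
		free(nat_table);
		return 0;
	}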
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 07d61a57613c..11b538deaaec 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -1,4 +1,4 @@
-/* SIP extension for UDP NAT alteration.
+/* SIP extension for NAT alteration.
  *
  * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
  * based on RR's ip_nat_ftp.c and other modules.
@@ -15,6 +15,7 @@
 #include <linux/ip.h>
 #include <net/ip.h>
 #include <linux/udp.h>
+#include <linux/tcp.h>
 
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_helper.h>
@@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper");
 MODULE_ALIAS("ip_nat_sip");
 
 
-static unsigned int mangle_packet(struct sk_buff *skb,
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
 				  const char **dptr, unsigned int *datalen,
 				  unsigned int matchoff, unsigned int matchlen,
 				  const char *buffer, unsigned int buflen)
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-
-	if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen,
-				      buffer, buflen))
-		return 0;
+	struct tcphdr *th;
+	unsigned int baseoff;
+
+	if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+		th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+		baseoff = ip_hdrlen(skb) + th->doff * 4;
+		matchoff += dataoff - baseoff;
+
+		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						matchoff, matchlen,
+						buffer, buflen, false))
+			return 0;
+	} else {
+		baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
+		matchoff += dataoff - baseoff;
+
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+					      matchoff, matchlen,
+					      buffer, buflen))
+			return 0;
+	}
 
 	/* Reload data pointer and adjust datalen value */
-	*dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+	*dptr = skb->data + dataoff;
 	*datalen += buflen - matchlen;
 	return 1;
 }
 
-static int map_addr(struct sk_buff *skb,
+static int map_addr(struct sk_buff *skb, unsigned int dataoff,
 		    const char **dptr, unsigned int *datalen,
 		    unsigned int matchoff, unsigned int matchlen,
 		    union nf_inet_addr *addr, __be16 port)
@@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb,
 
 	buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
 
-	return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			     buffer, buflen);
 }
 
-static int map_sip_addr(struct sk_buff *skb,
+static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
 			const char **dptr, unsigned int *datalen,
 			enum sip_header_types type)
 {
@@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb,
 	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
 				    &matchoff, &matchlen, &addr, &port) <= 0)
 		return 1;
-	return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port);
+	return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			&addr, port);
 }
 
-static unsigned int ip_nat_sip(struct sk_buff *skb,
+static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
 			       const char **dptr, unsigned int *datalen)
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	unsigned int dataoff, matchoff, matchlen;
+	unsigned int coff, matchoff, matchlen;
+	enum sip_header_types hdr;
 	union nf_inet_addr addr;
 	__be16 port;
 	int request, in_header;
@@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
 		if (ct_sip_parse_request(ct, *dptr, *datalen,
 					 &matchoff, &matchlen,
 					 &addr, &port) > 0 &&
-		    !map_addr(skb, dptr, datalen, matchoff, matchlen,
+		    !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			      &addr, port))
 			return NF_DROP;
 		request = 1;
 	} else
 		request = 0;
 
+	if (nf_ct_protonum(ct) == IPPROTO_TCP)
+		hdr = SIP_HDR_VIA_TCP;
+	else
+		hdr = SIP_HDR_VIA_UDP;
+
 	/* Translate topmost Via header and parameters */
 	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
-				    SIP_HDR_VIA, NULL, &matchoff, &matchlen,
+				    hdr, NULL, &matchoff, &matchlen,
 				    &addr, &port) > 0) {
 		unsigned int matchend, poff, plen, buflen, n;
 		char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
@@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
 			goto next;
 		}
 
-		if (!map_addr(skb, dptr, datalen, matchoff, matchlen,
+		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			      &addr, port))
 			return NF_DROP;
 
@@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
 		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
 					 &ct->tuplehash[!dir].tuple.dst.u3.ip);
-			if (!mangle_packet(skb, dptr, datalen, poff, plen,
-					   buffer, buflen))
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
 
@@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
 		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
 			buflen = sprintf(buffer, "%pI4",
 					 &ct->tuplehash[!dir].tuple.src.u3.ip);
-			if (!mangle_packet(skb, dptr, datalen, poff, plen,
-					   buffer, buflen))
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
 
@@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
 		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
 			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
 			buflen = sprintf(buffer, "%u", ntohs(p));
-			if (!mangle_packet(skb, dptr, datalen, poff, plen,
-					   buffer, buflen))
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
 				return NF_DROP;
 		}
 	}
 
 next:
 	/* Translate Contact headers */
-	dataoff = 0;
+	coff = 0;
 	in_header = 0;
-	while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen,
+	while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
 				       SIP_HDR_CONTACT, &in_header,
 				       &matchoff, &matchlen,
 				       &addr, &port) > 0) {
-		if (!map_addr(skb, dptr, datalen, matchoff, matchlen,
+		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			      &addr, port))
 			return NF_DROP;
 	}
 
-	if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) ||
-	    !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO))
+	if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+	    !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
 		return NF_DROP;
+
 	return NF_ACCEPT;
 }
 
+static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+		return;
+
+	th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+}
+
 /* Handles expected signalling connections and media streams */
 static void ip_nat_sip_expected(struct nf_conn *ct,
 				struct nf_conntrack_expect *exp)
@@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
 	}
 }
 
-static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
+static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
 				      const char **dptr, unsigned int *datalen,
 				      struct nf_conntrack_expect *exp,
 				      unsigned int matchoff,
@@ -279,8 +318,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
 	if (exp->tuple.dst.u3.ip != exp->saved_ip ||
 	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
 		buflen = sprintf(buffer, "%pI4:%u", &newip, port);
-		if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen,
-				   buffer, buflen))
+		if (!mangle_packet(skb, dataoff, dptr, datalen,
+				   matchoff, matchlen, buffer, buflen))
 			goto err;
 	}
 	return NF_ACCEPT;
@@ -290,7 +329,7 @@ err:
 	return NF_DROP;
 }
 
-static int mangle_content_len(struct sk_buff *skb,
+static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
 			      const char **dptr, unsigned int *datalen)
 {
 	enum ip_conntrack_info ctinfo;
@@ -312,12 +351,13 @@ static int mangle_content_len(struct sk_buff *skb,
 		return 0;
 
 	buflen = sprintf(buffer, "%u", c_len);
-	return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			     buffer, buflen);
 }
 
-static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
-			     unsigned int dataoff, unsigned int *datalen,
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
			     const char **dptr, unsigned int *datalen,
+			     unsigned int sdpoff,
 			     enum sdp_header_types type,
 			     enum sdp_header_types term,
 			     char *buffer, int buflen)
@@ -326,16 +366,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	unsigned int matchlen, matchoff;
 
-	if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term,
+	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
 				  &matchoff, &matchlen) <= 0)
 		return -ENOENT;
-	return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			     buffer, buflen) ? 0 : -EINVAL;
 }
 
-static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
-				    unsigned int dataoff,
-				    unsigned int *datalen,
+static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
+				    const char **dptr, unsigned int *datalen,
+				    unsigned int sdpoff,
 				    enum sdp_header_types type,
 				    enum sdp_header_types term,
 				    const union nf_inet_addr *addr)
@@ -344,16 +384,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
 	unsigned int buflen;
 
 	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term,
+	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
 			      buffer, buflen))
 		return 0;
 
-	return mangle_content_len(skb, dptr, datalen);
+	return mangle_content_len(skb, dataoff, dptr, datalen);
 }
 
-static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
-				    const char **dptr,
-				    unsigned int *datalen,
+static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
+				    const char **dptr, unsigned int *datalen,
 				    unsigned int matchoff,
 				    unsigned int matchlen,
 				    u_int16_t port)
@@ -362,16 +401,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
 	unsigned int buflen;
 
 	buflen = sprintf(buffer, "%u", port);
-	if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+	if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
 			   buffer, buflen))
 		return 0;
 
-	return mangle_content_len(skb, dptr, datalen);
+	return mangle_content_len(skb, dataoff, dptr, datalen);
 }
 
-static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
-				       unsigned int dataoff,
-				       unsigned int *datalen,
+static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
+				       const char **dptr, unsigned int *datalen,
+				       unsigned int sdpoff,
 				       const union nf_inet_addr *addr)
 {
 	char buffer[sizeof("nnn.nnn.nnn.nnn")];
@@ -379,12 +418,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
 
 	/* Mangle session description owner and contact addresses */
 	buflen = sprintf(buffer, "%pI4", &addr->ip);
-	if (mangle_sdp_packet(skb, dptr, dataoff, datalen,
+	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
 			       SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
 			       buffer, buflen))
 		return 0;
 
-	switch (mangle_sdp_packet(skb, dptr, dataoff, datalen,
+	switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
 				  SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
 				  buffer, buflen)) {
 	case 0:
@@ -401,14 +440,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
 		return 0;
 	}
 
-	return mangle_content_len(skb, dptr, datalen);
+	return mangle_content_len(skb, dataoff, dptr, datalen);
 }
 
 /* So, this packet has hit the connection tracking matching code.
    Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
-				     const char **dptr,
-				     unsigned int *datalen,
+static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
+				     const char **dptr, unsigned int *datalen,
 				     struct nf_conntrack_expect *rtp_exp,
 				     struct nf_conntrack_expect *rtcp_exp,
 				     unsigned int mediaoff,
@@ -456,7 +494,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
 
 	/* Update media port. */
 	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
-	    !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port))
+	    !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
+			     mediaoff, medialen, port))
 		goto err2;
 
 	return NF_ACCEPT;
@@ -471,6 +510,7 @@ err1:
 static void __exit nf_nat_sip_fini(void)
 {
 	rcu_assign_pointer(nf_nat_sip_hook, NULL);
+	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
 	rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
 	rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
 	rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
@@ -482,12 +522,14 @@ static void __exit nf_nat_sip_fini(void)
 static int __init nf_nat_sip_init(void)
 {
 	BUG_ON(nf_nat_sip_hook != NULL);
+	BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
 	BUG_ON(nf_nat_sip_expect_hook != NULL);
 	BUG_ON(nf_nat_sdp_addr_hook != NULL);
 	BUG_ON(nf_nat_sdp_port_hook != NULL);
 	BUG_ON(nf_nat_sdp_session_hook != NULL);
 	BUG_ON(nf_nat_sdp_media_hook != NULL);
 	rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
+	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
 	rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
 	rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
 	rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
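The dataoff parameter threaded through nf_nat_sip.c above exists because the SIP parser reports match offsets relative to the start of the current SIP message (dataoff), while the mangling primitives expect offsets relative to the end of the transport header (baseoff). Over UDP the two coincide; over TCP a segment may carry several SIP messages, so every match offset must be rebased. A small user-space sketch of that arithmetic, with illustrative header sizes:

	#include <stdio.h>

	static unsigned int rebase(unsigned int matchoff, unsigned int dataoff,
				   unsigned int iphdrlen, unsigned int thlen)
	{
		unsigned int baseoff = iphdrlen + thlen;

		/* matchoff is relative to dataoff; the manglers want it
		 * relative to baseoff, hence the same "+= dataoff - baseoff"
		 * as in mangle_packet() above */
		return matchoff + (dataoff - baseoff);
	}

	int main(void)
	{
		/* UDP: message starts right after the 8-byte header: no-op */
		printf("udp: %u\n", rebase(100, 20 + 8, 20, 8));	/* 100 */
		/* TCP: second SIP message starts 300 bytes into the payload */
		printf("tcp: %u\n", rebase(100, 20 + 20 + 300, 20, 20));/* 400 */
		return 0;
	}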
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d9521f6f9ed0..4d85b6e55f29 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -43,6 +43,7 @@
 #include <linux/moduleparam.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/udp.h>
@@ -1038,7 +1039,7 @@ static int snmp_parse_mangle(unsigned char *msg,
 	unsigned int cls, con, tag, vers, pdutype;
 	struct asn1_ctx ctx;
 	struct asn1_octstr comm;
-	struct snmp_object **obj;
+	struct snmp_object *obj;
 
 	if (debug > 1)
 		hex_dump(msg, len);
@@ -1148,43 +1149,34 @@ static int snmp_parse_mangle(unsigned char *msg,
 	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
 		return 0;
 
-	obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
-	if (obj == NULL) {
-		if (net_ratelimit())
-			printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
-		return 0;
-	}
-
 	while (!asn1_eoc_decode(&ctx, eoc)) {
 		unsigned int i;
 
-		if (!snmp_object_decode(&ctx, obj)) {
-			if (*obj) {
-				kfree((*obj)->id);
-				kfree(*obj);
+		if (!snmp_object_decode(&ctx, &obj)) {
+			if (obj) {
+				kfree(obj->id);
+				kfree(obj);
 			}
-			kfree(obj);
 			return 0;
 		}
 
 		if (debug > 1) {
 			printk(KERN_DEBUG "bsalg: object: ");
-			for (i = 0; i < (*obj)->id_len; i++) {
+			for (i = 0; i < obj->id_len; i++) {
 				if (i > 0)
 					printk(".");
-				printk("%lu", (*obj)->id[i]);
+				printk("%lu", obj->id[i]);
 			}
-			printk(": type=%u\n", (*obj)->type);
+			printk(": type=%u\n", obj->type);
 
 		}
 
-		if ((*obj)->type == SNMP_IPADDR)
+		if (obj->type == SNMP_IPADDR)
 			mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
 
-		kfree((*obj)->id);
-		kfree(*obj);
+		kfree(obj->id);
+		kfree(obj);
 	}
-	kfree(obj);
 
 	if (!asn1_eoc_decode(&ctx, eoc))
 		return 0;
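The nf_nat_snmp_basic.c fix above removes a needless level of indirection: the parser kept a heap-allocated pointer-to-pointer whose only job was to receive another heap pointer, which cost an extra kmalloc per parse and obscured ownership. A user-space sketch of the simplified shape, with a stubbed decoder:

	#include <stdio.h>
	#include <stdlib.h>

	struct snmp_object { unsigned long *id; unsigned int id_len; };

	/* stub: allocates the object, returns it via one out-pointer */
	static int decode(struct snmp_object **out)
	{
		struct snmp_object *obj = calloc(1, sizeof(*obj));

		*out = obj;
		if (!obj)
			return 0;	/* nothing allocated, nothing to free */
		obj->id = calloc(4, sizeof(*obj->id));
		if (!obj->id)
			return 0;	/* partial: caller sees *out, frees it */
		obj->id_len = 4;
		return 1;
	}

	int main(void)
	{
		struct snmp_object *obj;

		if (!decode(&obj)) {
			if (obj) {	/* same cleanup shape as the patched loop */
				free(obj->id);
				free(obj);
			}
			return 1;
		}
		printf("decoded %u sub-ids\n", obj->id_len);
		free(obj->id);
		free(obj);
		return 0;
	}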
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5f41d017ddd8..c39c9cf6bee6 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -7,6 +7,7 @@
  */
 #include <linux/types.h>
 #include <linux/icmp.h>
+#include <linux/gfp.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -197,11 +198,11 @@ nf_nat_out(unsigned int hooknum,
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 
-		if (ct->tuplehash[dir].tuple.src.u3.ip !=
-		    ct->tuplehash[!dir].tuple.dst.u3.ip
-		    || ct->tuplehash[dir].tuple.src.u.all !=
-		       ct->tuplehash[!dir].tuple.dst.u.all
+		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all)
 		    )
 			return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
 	}
 #endif
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f25542c48b7d..4f1f337f4337 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_SENTINEL
 };
 
-static struct {
-	char *name;
+static const struct {
+	const char *name;
 	int index;
 } icmpmibmap[] = {
 	{ "DestUnreachs", ICMP_DEST_UNREACH },
@@ -249,6 +249,8 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
 	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
 	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_SENTINEL
 };
 
@@ -280,7 +282,7 @@ static void icmpmsg_put(struct seq_file *seq)
 
 	count = 0;
 	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
-		val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i);
+		val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
 		if (val) {
 			type[count] = i;
 			vals[count++] = val;
@@ -307,18 +309,18 @@ static void icmp_put(struct seq_file *seq)
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " Out%s", icmpmibmap[i].name);
 	seq_printf(seq, "\nIcmp: %lu %lu",
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
 				icmpmibmap[i].index));
 	seq_printf(seq, " %lu %lu",
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
-		snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
 	for (i=0; icmpmibmap[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
 				icmpmibmap[i].index | 0x100));
 }
 
@@ -341,7 +343,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.ip_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
 					   snmp4_ipstats_list[i].entry));
 
 	icmp_put(seq);	/* RFC 2011 compatibility */
@@ -356,11 +358,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 		/* MaxConn field is signed, RFC 2012 */
 		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
 			seq_printf(seq, " %ld",
-				   snmp_fold_field((void **)net->mib.tcp_statistics,
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 		else
 			seq_printf(seq, " %lu",
-				   snmp_fold_field((void **)net->mib.tcp_statistics,
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 	}
 
@@ -371,7 +373,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nUdp:");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.udp_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.udp_statistics,
 					   snmp4_udp_list[i].entry));
 
 	/* the UDP and UDP-Lite MIBs are the same */
@@ -382,7 +384,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nUdpLite:");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.udplite_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
 					   snmp4_udp_list[i].entry));
 
 	seq_putc(seq, '\n');
@@ -419,7 +421,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nTcpExt:");
 	for (i = 0; snmp4_net_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.net_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.net_statistics,
 					   snmp4_net_list[i].entry));
 
 	seq_puts(seq, "\nIpExt:");
@@ -429,7 +431,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nIpExt:");
 	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
 		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)net->mib.ip_statistics,
+			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
 					   snmp4_ipextstats_list[i].entry));
 
 	seq_putc(seq, '\n');
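The (void __percpu **) casts added throughout proc.c are annotations for sparse, the kernel's static checker; they change no generated code. What snmp_fold_field() actually does is sum one MIB slot across all per-CPU copies, as in this user-space sketch (an array per CPU stands in for real per-CPU data):

	#include <stdio.h>

	#define NR_CPUS		4
	#define MIB_ITEMS	8

	static unsigned long mib[NR_CPUS][MIB_ITEMS];

	static unsigned long fold_field(int item)
	{
		unsigned long sum = 0;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			sum += mib[cpu][item];	/* one read per possible CPU */
		return sum;
	}

	int main(void)
	{
		mib[0][2] = 5;
		mib[3][2] = 7;
		printf("item 2 = %lu\n", fold_field(2));	/* 12 */
		return 0;
	}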
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ab996f9c0fe0..cc6f097fbd5f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -60,7 +60,6 @@
 #include <net/net_namespace.h>
 #include <net/dst.h>
 #include <net/sock.h>
-#include <linux/gfp.h>
 #include <linux/ip.h>
 #include <linux/net.h>
 #include <net/ip.h>
@@ -87,7 +86,7 @@ void raw_hash_sk(struct sock *sk)
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 	struct hlist_head *head;
 
-	head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
+	head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
 
 	write_lock_bh(&h->lock);
 	sk_add_node(sk, head);
@@ -115,9 +114,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 	sk_for_each_from(sk, node) {
 		struct inet_sock *inet = inet_sk(sk);
 
-		if (net_eq(sock_net(sk), net) && inet->num == num &&
-		    !(inet->daddr && inet->daddr != raddr) &&
-		    !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
+		if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+		    !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
 		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
 			goto found; /* gotcha */
 	}
@@ -292,7 +291,6 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
 	/* Charge it to the socket. */
 
 	if (sock_queue_rcv_skb(sk, skb) < 0) {
-		atomic_inc(&sk->sk_drops);
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
@@ -327,7 +325,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	int err;
 
 	if (length > rt->u.dst.dev->mtu) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
 			       rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
@@ -500,10 +498,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		err = -EDESTADDRREQ;
 		if (sk->sk_state != TCP_ESTABLISHED)
 			goto out;
-		daddr = inet->daddr;
+		daddr = inet->inet_daddr;
 	}
 
-	ipc.addr = inet->saddr;
+	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
 	ipc.shtx.flags = 0;
 	ipc.oif = sk->sk_bound_dev_if;
@@ -645,9 +643,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
 	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
 		goto out;
-	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-		inet->saddr = 0;  /* Use device */
+		inet->inet_saddr = 0;  /* Use device */
 	sk_dst_reset(sk);
 	ret = 0;
 out:	return ret;
@@ -692,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (err)
 		goto done;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (sin) {
@@ -717,7 +715,7 @@ static int raw_init(struct sock *sk)
 {
 	struct raw_sock *rp = raw_sk(sk);
 
-	if (inet_sk(sk)->num == IPPROTO_ICMP)
+	if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
 		memset(&rp->filter, 0, sizeof(rp->filter));
 	return 0;
 }
@@ -754,7 +752,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
 	if (optname == ICMP_FILTER) {
-		if (inet_sk(sk)->num != IPPROTO_ICMP)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
 		else
 			return raw_seticmpfilter(sk, optval, optlen);
@@ -784,7 +782,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
 	if (optname == ICMP_FILTER) {
-		if (inet_sk(sk)->num != IPPROTO_ICMP)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
 		else
 			return raw_geticmpfilter(sk, optval, optlen);
@@ -943,10 +941,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop);
 static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 {
 	struct inet_sock *inet = inet_sk(sp);
-	__be32 dest = inet->daddr,
-	       src = inet->rcv_saddr;
+	__be32 dest = inet->inet_daddr,
+	       src = inet->inet_rcv_saddr;
 	__u16 destp = 0,
-	      srcp  = inet->num;
+	      srcp  = inet->inet_num;
 
 	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5b1050a5d874..cb562fdd9b9a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
90#include <linux/jhash.h> 90#include <linux/jhash.h>
91#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
92#include <linux/times.h> 92#include <linux/times.h>
93#include <linux/slab.h>
93#include <net/dst.h> 94#include <net/dst.h>
94#include <net/net_namespace.h> 95#include <net/net_namespace.h>
95#include <net/protocol.h> 96#include <net/protocol.h>
@@ -146,7 +147,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 147static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 148static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 149static int rt_garbage_collect(struct dst_ops *ops);
149static void rt_emergency_hash_rebuild(struct net *net);
150 150
151 151
152static struct dst_ops ipv4_dst_ops = { 152static struct dst_ops ipv4_dst_ops = {
@@ -287,12 +287,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 if (!rt_hash_table[st->bucket].chain) 287 if (!rt_hash_table[st->bucket].chain)
288 continue; 288 continue;
289 rcu_read_lock_bh(); 289 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain); 290 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 while (r) { 291 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) && 292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid) 293 r->rt_genid == st->genid)
294 return r; 294 return r;
295 r = rcu_dereference(r->u.dst.rt_next); 295 r = rcu_dereference_bh(r->u.dst.rt_next);
296 } 296 }
297 rcu_read_unlock_bh(); 297 rcu_read_unlock_bh();
298 } 298 }
@@ -314,7 +314,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
314 rcu_read_lock_bh(); 314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain; 315 r = rt_hash_table[st->bucket].chain;
316 } 316 }
317 return rcu_dereference(r); 317 return rcu_dereference_bh(r);
318} 318}
319 319
320static struct rtable *rt_cache_get_next(struct seq_file *seq, 320static struct rtable *rt_cache_get_next(struct seq_file *seq,
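The two hunks above swap rcu_dereference() for rcu_dereference_bh() inside rcu_read_lock_bh() sections, so that lockdep-enabled RCU can verify the dereference flavour matches the BH read-side critical section it runs under. A minimal sketch of the pairing, assuming a hypothetical node type with a next pointer standing in for the rt_hash chain (the RCU APIs themselves are real):

/* Sketch only: walking an RCU-protected list under the BH flavour. */
struct node *p;

rcu_read_lock_bh();
for (p = rcu_dereference_bh(head); p != NULL; p = rcu_dereference_bh(p->next)) {
	/* *p may be used read-only until rcu_read_unlock_bh() */
}
rcu_read_unlock_bh();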
@@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = {
513}; 513};
514 514
515#ifdef CONFIG_NET_CLS_ROUTE 515#ifdef CONFIG_NET_CLS_ROUTE
516static int ip_rt_acct_read(char *buffer, char **start, off_t offset, 516static int rt_acct_proc_show(struct seq_file *m, void *v)
517 int length, int *eof, void *data) 517{
518{ 518 struct ip_rt_acct *dst, *src;
519 unsigned int i; 519 unsigned int i, j;
520 520
521 if ((offset & 3) || (length & 3)) 521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 return -EIO; 522 if (!dst)
523 523 return -ENOMEM;
524 if (offset >= sizeof(struct ip_rt_acct) * 256) { 524
525 *eof = 1; 525 for_each_possible_cpu(i) {
526 return 0; 526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 } 527 for (j = 0; j < 256; j++) {
528 528 dst[j].o_bytes += src[j].o_bytes;
529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) { 529 dst[j].o_packets += src[j].o_packets;
530 length = sizeof(struct ip_rt_acct) * 256 - offset; 530 dst[j].i_bytes += src[j].i_bytes;
531 *eof = 1; 531 dst[j].i_packets += src[j].i_packets;
532 }
532 } 533 }
533 534
534 offset /= sizeof(u32); 535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
535 536 kfree(dst);
536 if (length > 0) { 537 return 0;
537 u32 *dst = (u32 *) buffer; 538}
538
539 *start = buffer;
540 memset(dst, 0, length);
541
542 for_each_possible_cpu(i) {
543 unsigned int j;
544 u32 *src;
545 539
546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; 540static int rt_acct_proc_open(struct inode *inode, struct file *file)
547 for (j = 0; j < length/4; j++) 541{
548 dst[j] += src[j]; 542 return single_open(file, rt_acct_proc_show, NULL);
549 }
550 }
551 return length;
552} 543}
544
545static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
551};
553#endif 552#endif
554 553
555static int __net_init ip_rt_do_proc_init(struct net *net) 554static int __net_init ip_rt_do_proc_init(struct net *net)
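The rt_acct conversion above retires the legacy read_proc callback (ip_rt_acct_read, with its offset/length/eof bookkeeping) in favour of a seq_file: the show function sums the per-CPU counters once into a kcalloc'd buffer and emits it with seq_write(). The pattern is generic; a minimal sketch with illustrative names, not taken from the patch:

static int example_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%d\n", 42);	/* emit the whole payload at once */
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_proc_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = example_proc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

/* registration, as in the hunk above:
 *	proc_create("example", 0, net->proc_net, &example_proc_fops);
 */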
@@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 goto err2; 566 goto err2;
568 567
569#ifdef CONFIG_NET_CLS_ROUTE 568#ifdef CONFIG_NET_CLS_ROUTE
570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net, 569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
571 ip_rt_acct_read, NULL);
572 if (!pde) 570 if (!pde)
573 goto err3; 571 goto err3;
574#endif 572#endif
@@ -588,7 +586,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 586{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 587 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 588 remove_proc_entry("rt_cache", net->proc_net);
589#ifdef CONFIG_NET_CLS_ROUTE
591 remove_proc_entry("rt_acct", net->proc_net); 590 remove_proc_entry("rt_acct", net->proc_net);
591#endif
592} 592}
593 593
594static struct pernet_operations ip_rt_proc_ops __net_initdata = { 594static struct pernet_operations ip_rt_proc_ops __net_initdata = {
@@ -703,7 +703,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
703 703
704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705{ 705{
706 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); 706 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
707} 707}
708 708
709static inline int rt_is_expired(struct rtable *rth) 709static inline int rt_is_expired(struct rtable *rth)
@@ -780,11 +780,30 @@ static void rt_do_flush(int process_context)
780#define FRACT_BITS 3 780#define FRACT_BITS 3
781#define ONE (1UL << FRACT_BITS) 781#define ONE (1UL << FRACT_BITS)
782 782
783/*
 784 * Given a hash chain and an item in this chain,
 785 * determine whether any earlier entry has the same hash_inputs
 786 * (but differs on tos, mark or oif).
 787 * Returns 0 if such an alias is found.
 788 * Returns ONE if rth has no alias before itself.
789 */
790static int has_noalias(const struct rtable *head, const struct rtable *rth)
791{
792 const struct rtable *aux = head;
793
794 while (aux != rth) {
795 if (compare_hash_inputs(&aux->fl, &rth->fl))
796 return 0;
797 aux = aux->u.dst.rt_next;
798 }
799 return ONE;
800}
801
783static void rt_check_expire(void) 802static void rt_check_expire(void)
784{ 803{
785 static unsigned int rover; 804 static unsigned int rover;
786 unsigned int i = rover, goal; 805 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp; 806 struct rtable *rth, **rthp;
788 unsigned long samples = 0; 807 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0; 808 unsigned long sum = 0, sum2 = 0;
790 unsigned long delta; 809 unsigned long delta;
@@ -835,15 +854,7 @@ nofree:
835 * attributes don't unfairly skew 854 * attributes don't unfairly skew
836 * the length computation 855 * the length computation
837 */ 856 */
838 for (aux = rt_hash_table[i].chain;;) { 857 length += has_noalias(rt_hash_table[i].chain, rth);
839 if (aux == rth) {
840 length += ONE;
841 break;
842 }
843 if (compare_hash_inputs(&aux->fl, &rth->fl))
844 break;
845 aux = aux->u.dst.rt_next;
846 }
847 continue; 858 continue;
848 } 859 }
849 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) 860 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
@@ -902,6 +913,12 @@ void rt_cache_flush(struct net *net, int delay)
902 rt_do_flush(!in_softirq()); 913 rt_do_flush(!in_softirq());
903} 914}
904 915
 916/* Flush previously invalidated entries from the cache */
917void rt_cache_flush_batch(void)
918{
919 rt_do_flush(!in_softirq());
920}
921
905/* 922/*
906 * We change rt_genid and let gc do the cleanup 923 * We change rt_genid and let gc do the cleanup
907 */ 924 */
@@ -916,10 +933,8 @@ static void rt_secret_rebuild_oneshot(struct net *net)
916{ 933{
917 del_timer_sync(&net->ipv4.rt_secret_timer); 934 del_timer_sync(&net->ipv4.rt_secret_timer);
918 rt_cache_invalidate(net); 935 rt_cache_invalidate(net);
919 if (ip_rt_secret_interval) { 936 if (ip_rt_secret_interval)
920 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; 937 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
921 add_timer(&net->ipv4.rt_secret_timer);
922 }
923} 938}
924 939
925static void rt_emergency_hash_rebuild(struct net *net) 940static void rt_emergency_hash_rebuild(struct net *net)
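rt_secret_rebuild_oneshot() (and rt_secret_reschedule() further down) now use mod_timer() instead of the open-coded expires update followed by add_timer(); mod_timer() atomically (re)arms a timer whether or not it is currently pending, so the caller no longer has to know that del_timer_sync() just ran. A sketch of the idiom, with timer and interval as placeholders:

/* Before: only legal if the timer is known to be inactive. */
timer->expires = jiffies + interval;
add_timer(timer);

/* After: safe whether the timer is pending or not. */
mod_timer(timer, jiffies + interval);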
@@ -1067,8 +1082,23 @@ work_done:
1067out: return 0; 1082out: return 0;
1068} 1083}
1069 1084
1085/*
1086 * Returns the number of entries in a hash chain that have distinct hash_inputs
1087 */
1088static int slow_chain_length(const struct rtable *head)
1089{
1090 int length = 0;
1091 const struct rtable *rth = head;
1092
1093 while (rth) {
1094 length += has_noalias(head, rth);
1095 rth = rth->u.dst.rt_next;
1096 }
1097 return length >> FRACT_BITS;
1098}
1099
1070static int rt_intern_hash(unsigned hash, struct rtable *rt, 1100static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 struct rtable **rp, struct sk_buff *skb) 1101 struct rtable **rp, struct sk_buff *skb, int ifindex)
1072{ 1102{
1073 struct rtable *rth, **rthp; 1103 struct rtable *rth, **rthp;
1074 unsigned long now; 1104 unsigned long now;
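has_noalias() and slow_chain_length() together estimate how long a hash chain "really" is: entries that share hash_inputs with an earlier entry (aliases differing only in tos, mark or oif) contribute 0, every other entry contributes ONE, and the sum is scaled back down by FRACT_BITS. A standalone mock of the arithmetic, with compare_hash_inputs() reduced to an integer key (everything here is illustrative, buildable with a C99 compiler):

#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

struct rt { int key; struct rt *next; };	/* mock of struct rtable */

/* 0 if an earlier entry shares rth's key (an alias), ONE otherwise */
static unsigned long has_noalias(const struct rt *head, const struct rt *rth)
{
	for (const struct rt *aux = head; aux != rth; aux = aux->next)
		if (aux->key == rth->key)
			return 0;
	return ONE;
}

static unsigned long slow_chain_length(const struct rt *head)
{
	unsigned long length = 0;

	for (const struct rt *rth = head; rth; rth = rth->next)
		length += has_noalias(head, rth);
	return length >> FRACT_BITS;
}

int main(void)
{
	struct rt c = { 2, NULL }, b = { 1, &c }, a = { 1, &b };

	/* keys 1,1,2: the second 1 is an alias, effective length is 2 */
	printf("%lu\n", slow_chain_length(&a));
	return 0;
}

In rt_intern_hash() this matters because the cheap chain_length can be inflated by harmless aliases; the emergency hash rebuild is now triggered only when the slow, alias-aware count also exceeds rt_chain_length_max.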
@@ -1179,14 +1209,20 @@ restart:
1179 rt_free(cand); 1209 rt_free(cand);
1180 } 1210 }
1181 } else { 1211 } else {
1182 if (chain_length > rt_chain_length_max) { 1212 if (chain_length > rt_chain_length_max &&
1213 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1183 struct net *net = dev_net(rt->u.dst.dev); 1214 struct net *net = dev_net(rt->u.dst.dev);
1184 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1215 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1185 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1216 if (!rt_caching(net)) {
1186 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1217 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1187 rt->u.dst.dev->name, num); 1218 rt->u.dst.dev->name, num);
1188 } 1219 }
1189 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); 1220 rt_emergency_hash_rebuild(net);
1221 spin_unlock_bh(rt_hash_lock_addr(hash));
1222
1223 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1224 ifindex, rt_genid(net));
1225 goto restart;
1190 } 1226 }
1191 } 1227 }
1192 1228
@@ -1346,9 +1382,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1346 return; 1382 return;
1347 1383
1348 net = dev_net(dev); 1384 net = dev_net(dev);
1349 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) 1385 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1350 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) 1386 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1351 || ipv4_is_zeronet(new_gw)) 1387 ipv4_is_zeronet(new_gw))
1352 goto reject_redirect; 1388 goto reject_redirect;
1353 1389
1354 if (!rt_caching(net)) 1390 if (!rt_caching(net))
@@ -1411,7 +1447,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1411 dev_hold(rt->u.dst.dev); 1447 dev_hold(rt->u.dst.dev);
1412 if (rt->idev) 1448 if (rt->idev)
1413 in_dev_hold(rt->idev); 1449 in_dev_hold(rt->idev);
1414 rt->u.dst.obsolete = 0; 1450 rt->u.dst.obsolete = -1;
1415 rt->u.dst.lastuse = jiffies; 1451 rt->u.dst.lastuse = jiffies;
1416 rt->u.dst.path = &rt->u.dst; 1452 rt->u.dst.path = &rt->u.dst;
1417 rt->u.dst.neighbour = NULL; 1453 rt->u.dst.neighbour = NULL;
@@ -1447,7 +1483,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1447 &netevent); 1483 &netevent);
1448 1484
1449 rt_del(hash, rth); 1485 rt_del(hash, rth);
1450 if (!rt_intern_hash(hash, rt, &rt, NULL)) 1486 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1451 ip_rt_put(rt); 1487 ip_rt_put(rt);
1452 goto do_next; 1488 goto do_next;
1453 } 1489 }
@@ -1476,11 +1512,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1476 struct dst_entry *ret = dst; 1512 struct dst_entry *ret = dst;
1477 1513
1478 if (rt) { 1514 if (rt) {
1479 if (dst->obsolete) { 1515 if (dst->obsolete > 0) {
1480 ip_rt_put(rt); 1516 ip_rt_put(rt);
1481 ret = NULL; 1517 ret = NULL;
1482 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1518 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1483 rt->u.dst.expires) { 1519 (rt->u.dst.expires &&
1520 time_after_eq(jiffies, rt->u.dst.expires))) {
1484 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1521 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1485 rt->fl.oif, 1522 rt->fl.oif,
1486 rt_genid(dev_net(dst->dev))); 1523 rt_genid(dev_net(dst->dev)));
@@ -1628,9 +1665,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1628 __be32 daddr = iph->daddr; 1665 __be32 daddr = iph->daddr;
1629 unsigned short est_mtu = 0; 1666 unsigned short est_mtu = 0;
1630 1667
1631 if (ipv4_config.no_pmtu_disc)
1632 return 0;
1633
1634 for (k = 0; k < 2; k++) { 1668 for (k = 0; k < 2; k++) {
1635 for (i = 0; i < 2; i++) { 1669 for (i = 0; i < 2; i++) {
1636 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1670 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
@@ -1699,7 +1733,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1699 1733
1700static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1734static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1701{ 1735{
1702 return NULL; 1736 if (rt_is_expired((struct rtable *)dst))
1737 return NULL;
1738 return dst;
1703} 1739}
1704 1740
1705static void ipv4_dst_destroy(struct dst_entry *dst) 1741static void ipv4_dst_destroy(struct dst_entry *dst)
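Setting u.dst.obsolete to -1 on every new cache entry (here and in the __mkroute_input/__mkroute_output hunks below) makes dst_check() actually invoke ipv4_dst_check(), which now reports the entry dead once rt_is_expired() sees a stale rt_genid; previously obsolete was 0 and ipv4_dst_check() was never reached. A sketch of the consumer-side effect, assuming the usual dst_check() contract that a non-zero obsolete value routes through ops->check (the cookie is unused by IPv4):

/* Sketch: revalidating a cached route after this change. */
struct dst_entry *dst = __sk_dst_get(sk);	/* some cached dst */

dst = dst_check(dst, 0);	/* obsolete == -1 => ipv4_dst_check() runs */
if (dst == NULL) {
	/* generation changed (rt_is_expired): re-resolve instead of reusing */
}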
@@ -1861,7 +1897,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1861 if (!rth) 1897 if (!rth)
1862 goto e_nobufs; 1898 goto e_nobufs;
1863 1899
1864 rth->u.dst.output= ip_rt_bug; 1900 rth->u.dst.output = ip_rt_bug;
1901 rth->u.dst.obsolete = -1;
1865 1902
1866 atomic_set(&rth->u.dst.__refcnt, 1); 1903 atomic_set(&rth->u.dst.__refcnt, 1);
1867 rth->u.dst.flags= DST_HOST; 1904 rth->u.dst.flags= DST_HOST;
@@ -1900,7 +1937,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1900 1937
1901 in_dev_put(in_dev); 1938 in_dev_put(in_dev);
1902 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1939 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1903 return rt_intern_hash(hash, rth, NULL, skb); 1940 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1904 1941
1905e_nobufs: 1942e_nobufs:
1906 in_dev_put(in_dev); 1943 in_dev_put(in_dev);
@@ -1987,8 +2024,13 @@ static int __mkroute_input(struct sk_buff *skb,
1987 if (skb->protocol != htons(ETH_P_IP)) { 2024 if (skb->protocol != htons(ETH_P_IP)) {
1988 /* Not IP (i.e. ARP). Do not create route, if it is 2025 /* Not IP (i.e. ARP). Do not create route, if it is
1989 * invalid for proxy arp. DNAT routes are always valid. 2026 * invalid for proxy arp. DNAT routes are always valid.
2027 *
2028 * The proxy arp feature has been extended to allow ARP
2029 * replies back out the same interface, to support
2030 * Private VLAN switch technologies. See arp.c.
1990 */ 2031 */
1991 if (out_dev == in_dev) { 2032 if (out_dev == in_dev &&
2033 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1992 err = -EINVAL; 2034 err = -EINVAL;
1993 goto cleanup; 2035 goto cleanup;
1994 } 2036 }
@@ -2022,6 +2064,7 @@ static int __mkroute_input(struct sk_buff *skb,
2022 rth->fl.oif = 0; 2064 rth->fl.oif = 0;
2023 rth->rt_spec_dst= spec_dst; 2065 rth->rt_spec_dst= spec_dst;
2024 2066
2067 rth->u.dst.obsolete = -1;
2025 rth->u.dst.input = ip_forward; 2068 rth->u.dst.input = ip_forward;
2026 rth->u.dst.output = ip_output; 2069 rth->u.dst.output = ip_output;
2027 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); 2070 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
@@ -2061,7 +2104,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2061 /* put it into the cache */ 2104 /* put it into the cache */
2062 hash = rt_hash(daddr, saddr, fl->iif, 2105 hash = rt_hash(daddr, saddr, fl->iif,
2063 rt_genid(dev_net(rth->u.dst.dev))); 2106 rt_genid(dev_net(rth->u.dst.dev)));
2064 return rt_intern_hash(hash, rth, NULL, skb); 2107 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2065} 2108}
2066 2109
2067/* 2110/*
@@ -2186,6 +2229,7 @@ local_input:
2186 goto e_nobufs; 2229 goto e_nobufs;
2187 2230
2188 rth->u.dst.output= ip_rt_bug; 2231 rth->u.dst.output= ip_rt_bug;
2232 rth->u.dst.obsolete = -1;
2189 rth->rt_genid = rt_genid(net); 2233 rth->rt_genid = rt_genid(net);
2190 2234
2191 atomic_set(&rth->u.dst.__refcnt, 1); 2235 atomic_set(&rth->u.dst.__refcnt, 1);
@@ -2217,7 +2261,7 @@ local_input:
2217 } 2261 }
2218 rth->rt_type = res.type; 2262 rth->rt_type = res.type;
2219 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2263 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2220 err = rt_intern_hash(hash, rth, NULL, skb); 2264 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2221 goto done; 2265 goto done;
2222 2266
2223no_route: 2267no_route:
@@ -2314,10 +2358,11 @@ skip_cache:
2314 ip_hdr(skb)->protocol); 2358 ip_hdr(skb)->protocol);
2315 if (our 2359 if (our
2316#ifdef CONFIG_IP_MROUTE 2360#ifdef CONFIG_IP_MROUTE
2317 || (!ipv4_is_local_multicast(daddr) && 2361 ||
2318 IN_DEV_MFORWARD(in_dev)) 2362 (!ipv4_is_local_multicast(daddr) &&
2363 IN_DEV_MFORWARD(in_dev))
2319#endif 2364#endif
2320 ) { 2365 ) {
2321 rcu_read_unlock(); 2366 rcu_read_unlock();
2322 return ip_route_input_mc(skb, daddr, saddr, 2367 return ip_route_input_mc(skb, daddr, saddr,
2323 tos, dev, our); 2368 tos, dev, our);
@@ -2411,6 +2456,7 @@ static int __mkroute_output(struct rtable **result,
2411 rth->rt_spec_dst= fl->fl4_src; 2456 rth->rt_spec_dst= fl->fl4_src;
2412 2457
2413 rth->u.dst.output=ip_output; 2458 rth->u.dst.output=ip_output;
2459 rth->u.dst.obsolete = -1;
2414 rth->rt_genid = rt_genid(dev_net(dev_out)); 2460 rth->rt_genid = rt_genid(dev_net(dev_out));
2415 2461
2416 RT_CACHE_STAT_INC(out_slow_tot); 2462 RT_CACHE_STAT_INC(out_slow_tot);
@@ -2462,7 +2508,7 @@ static int ip_mkroute_output(struct rtable **rp,
2462 if (err == 0) { 2508 if (err == 0) {
2463 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, 2509 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2464 rt_genid(dev_net(dev_out))); 2510 rt_genid(dev_net(dev_out)));
2465 err = rt_intern_hash(hash, rth, rp, NULL); 2511 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2466 } 2512 }
2467 2513
2468 return err; 2514 return err;
@@ -2514,9 +2560,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2514 of another iface. --ANK 2560 of another iface. --ANK
2515 */ 2561 */
2516 2562
2517 if (oldflp->oif == 0 2563 if (oldflp->oif == 0 &&
2518 && (ipv4_is_multicast(oldflp->fl4_dst) || 2564 (ipv4_is_multicast(oldflp->fl4_dst) ||
2519 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2565 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2520 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2566 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2521 dev_out = ip_dev_find(net, oldflp->fl4_src); 2567 dev_out = ip_dev_find(net, oldflp->fl4_src);
2522 if (dev_out == NULL) 2568 if (dev_out == NULL)
@@ -2685,8 +2731,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2685 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2731 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2686 2732
2687 rcu_read_lock_bh(); 2733 rcu_read_lock_bh();
2688 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2734 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2689 rth = rcu_dereference(rth->u.dst.rt_next)) { 2735 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2690 if (rth->fl.fl4_dst == flp->fl4_dst && 2736 if (rth->fl.fl4_dst == flp->fl4_dst &&
2691 rth->fl.fl4_src == flp->fl4_src && 2737 rth->fl.fl4_src == flp->fl4_src &&
2692 rth->fl.iif == 0 && 2738 rth->fl.iif == 0 &&
@@ -2855,7 +2901,7 @@ static int rt_fill_info(struct net *net,
2855 error = rt->u.dst.error; 2901 error = rt->u.dst.error;
2856 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; 2902 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2857 if (rt->peer) { 2903 if (rt->peer) {
2858 id = rt->peer->ip_id_count; 2904 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2859 if (rt->peer->tcp_ts_stamp) { 2905 if (rt->peer->tcp_ts_stamp) {
2860 ts = rt->peer->tcp_ts; 2906 ts = rt->peer->tcp_ts;
2861 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2907 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
@@ -3004,8 +3050,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3004 if (!rt_hash_table[h].chain) 3050 if (!rt_hash_table[h].chain)
3005 continue; 3051 continue;
3006 rcu_read_lock_bh(); 3052 rcu_read_lock_bh();
3007 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 3053 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3008 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 3054 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3009 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) 3055 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3010 continue; 3056 continue;
3011 if (rt_is_expired(rt)) 3057 if (rt_is_expired(rt))
@@ -3056,23 +3102,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3056 return -EINVAL; 3102 return -EINVAL;
3057} 3103}
3058 3104
3059static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3060 void __user *oldval,
3061 size_t __user *oldlenp,
3062 void __user *newval,
3063 size_t newlen)
3064{
3065 int delay;
3066 struct net *net;
3067 if (newlen != sizeof(int))
3068 return -EINVAL;
3069 if (get_user(delay, (int __user *)newval))
3070 return -EFAULT;
3071 net = (struct net *)table->extra1;
3072 rt_cache_flush(net, delay);
3073 return 0;
3074}
3075
3076static void rt_secret_reschedule(int old) 3105static void rt_secret_reschedule(int old)
3077{ 3106{
3078 struct net *net; 3107 struct net *net;
@@ -3085,22 +3114,20 @@ static void rt_secret_reschedule(int old)
3085 rtnl_lock(); 3114 rtnl_lock();
3086 for_each_net(net) { 3115 for_each_net(net) {
3087 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); 3116 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3117 long time;
3088 3118
3089 if (!new) 3119 if (!new)
3090 continue; 3120 continue;
3091 3121
3092 if (deleted) { 3122 if (deleted) {
3093 long time = net->ipv4.rt_secret_timer.expires - jiffies; 3123 time = net->ipv4.rt_secret_timer.expires - jiffies;
3094 3124
3095 if (time <= 0 || (time += diff) <= 0) 3125 if (time <= 0 || (time += diff) <= 0)
3096 time = 0; 3126 time = 0;
3097
3098 net->ipv4.rt_secret_timer.expires = time;
3099 } else 3127 } else
3100 net->ipv4.rt_secret_timer.expires = new; 3128 time = new;
3101 3129
3102 net->ipv4.rt_secret_timer.expires += jiffies; 3130 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
3103 add_timer(&net->ipv4.rt_secret_timer);
3104 } 3131 }
3105 rtnl_unlock(); 3132 rtnl_unlock();
3106} 3133}
@@ -3117,23 +3144,8 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3117 return ret; 3144 return ret;
3118} 3145}
3119 3146
3120static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3121 void __user *oldval,
3122 size_t __user *oldlenp,
3123 void __user *newval,
3124 size_t newlen)
3125{
3126 int old = ip_rt_secret_interval;
3127 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3128
3129 rt_secret_reschedule(old);
3130
3131 return ret;
3132}
3133
3134static ctl_table ipv4_route_table[] = { 3147static ctl_table ipv4_route_table[] = {
3135 { 3148 {
3136 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3137 .procname = "gc_thresh", 3149 .procname = "gc_thresh",
3138 .data = &ipv4_dst_ops.gc_thresh, 3150 .data = &ipv4_dst_ops.gc_thresh,
3139 .maxlen = sizeof(int), 3151 .maxlen = sizeof(int),
@@ -3141,7 +3153,6 @@ static ctl_table ipv4_route_table[] = {
3141 .proc_handler = proc_dointvec, 3153 .proc_handler = proc_dointvec,
3142 }, 3154 },
3143 { 3155 {
3144 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3145 .procname = "max_size", 3156 .procname = "max_size",
3146 .data = &ip_rt_max_size, 3157 .data = &ip_rt_max_size,
3147 .maxlen = sizeof(int), 3158 .maxlen = sizeof(int),
@@ -3151,43 +3162,34 @@ static ctl_table ipv4_route_table[] = {
3151 { 3162 {
3152 /* Deprecated. Use gc_min_interval_ms */ 3163 /* Deprecated. Use gc_min_interval_ms */
3153 3164
3154 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3155 .procname = "gc_min_interval", 3165 .procname = "gc_min_interval",
3156 .data = &ip_rt_gc_min_interval, 3166 .data = &ip_rt_gc_min_interval,
3157 .maxlen = sizeof(int), 3167 .maxlen = sizeof(int),
3158 .mode = 0644, 3168 .mode = 0644,
3159 .proc_handler = proc_dointvec_jiffies, 3169 .proc_handler = proc_dointvec_jiffies,
3160 .strategy = sysctl_jiffies,
3161 }, 3170 },
3162 { 3171 {
3163 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3164 .procname = "gc_min_interval_ms", 3172 .procname = "gc_min_interval_ms",
3165 .data = &ip_rt_gc_min_interval, 3173 .data = &ip_rt_gc_min_interval,
3166 .maxlen = sizeof(int), 3174 .maxlen = sizeof(int),
3167 .mode = 0644, 3175 .mode = 0644,
3168 .proc_handler = proc_dointvec_ms_jiffies, 3176 .proc_handler = proc_dointvec_ms_jiffies,
3169 .strategy = sysctl_ms_jiffies,
3170 }, 3177 },
3171 { 3178 {
3172 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3173 .procname = "gc_timeout", 3179 .procname = "gc_timeout",
3174 .data = &ip_rt_gc_timeout, 3180 .data = &ip_rt_gc_timeout,
3175 .maxlen = sizeof(int), 3181 .maxlen = sizeof(int),
3176 .mode = 0644, 3182 .mode = 0644,
3177 .proc_handler = proc_dointvec_jiffies, 3183 .proc_handler = proc_dointvec_jiffies,
3178 .strategy = sysctl_jiffies,
3179 }, 3184 },
3180 { 3185 {
3181 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3182 .procname = "gc_interval", 3186 .procname = "gc_interval",
3183 .data = &ip_rt_gc_interval, 3187 .data = &ip_rt_gc_interval,
3184 .maxlen = sizeof(int), 3188 .maxlen = sizeof(int),
3185 .mode = 0644, 3189 .mode = 0644,
3186 .proc_handler = proc_dointvec_jiffies, 3190 .proc_handler = proc_dointvec_jiffies,
3187 .strategy = sysctl_jiffies,
3188 }, 3191 },
3189 { 3192 {
3190 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3191 .procname = "redirect_load", 3193 .procname = "redirect_load",
3192 .data = &ip_rt_redirect_load, 3194 .data = &ip_rt_redirect_load,
3193 .maxlen = sizeof(int), 3195 .maxlen = sizeof(int),
@@ -3195,7 +3197,6 @@ static ctl_table ipv4_route_table[] = {
3195 .proc_handler = proc_dointvec, 3197 .proc_handler = proc_dointvec,
3196 }, 3198 },
3197 { 3199 {
3198 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3199 .procname = "redirect_number", 3200 .procname = "redirect_number",
3200 .data = &ip_rt_redirect_number, 3201 .data = &ip_rt_redirect_number,
3201 .maxlen = sizeof(int), 3202 .maxlen = sizeof(int),
@@ -3203,7 +3204,6 @@ static ctl_table ipv4_route_table[] = {
3203 .proc_handler = proc_dointvec, 3204 .proc_handler = proc_dointvec,
3204 }, 3205 },
3205 { 3206 {
3206 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3207 .procname = "redirect_silence", 3207 .procname = "redirect_silence",
3208 .data = &ip_rt_redirect_silence, 3208 .data = &ip_rt_redirect_silence,
3209 .maxlen = sizeof(int), 3209 .maxlen = sizeof(int),
@@ -3211,7 +3211,6 @@ static ctl_table ipv4_route_table[] = {
3211 .proc_handler = proc_dointvec, 3211 .proc_handler = proc_dointvec,
3212 }, 3212 },
3213 { 3213 {
3214 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3215 .procname = "error_cost", 3214 .procname = "error_cost",
3216 .data = &ip_rt_error_cost, 3215 .data = &ip_rt_error_cost,
3217 .maxlen = sizeof(int), 3216 .maxlen = sizeof(int),
@@ -3219,7 +3218,6 @@ static ctl_table ipv4_route_table[] = {
3219 .proc_handler = proc_dointvec, 3218 .proc_handler = proc_dointvec,
3220 }, 3219 },
3221 { 3220 {
3222 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3223 .procname = "error_burst", 3221 .procname = "error_burst",
3224 .data = &ip_rt_error_burst, 3222 .data = &ip_rt_error_burst,
3225 .maxlen = sizeof(int), 3223 .maxlen = sizeof(int),
@@ -3227,7 +3225,6 @@ static ctl_table ipv4_route_table[] = {
3227 .proc_handler = proc_dointvec, 3225 .proc_handler = proc_dointvec,
3228 }, 3226 },
3229 { 3227 {
3230 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3231 .procname = "gc_elasticity", 3228 .procname = "gc_elasticity",
3232 .data = &ip_rt_gc_elasticity, 3229 .data = &ip_rt_gc_elasticity,
3233 .maxlen = sizeof(int), 3230 .maxlen = sizeof(int),
@@ -3235,16 +3232,13 @@ static ctl_table ipv4_route_table[] = {
3235 .proc_handler = proc_dointvec, 3232 .proc_handler = proc_dointvec,
3236 }, 3233 },
3237 { 3234 {
3238 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3239 .procname = "mtu_expires", 3235 .procname = "mtu_expires",
3240 .data = &ip_rt_mtu_expires, 3236 .data = &ip_rt_mtu_expires,
3241 .maxlen = sizeof(int), 3237 .maxlen = sizeof(int),
3242 .mode = 0644, 3238 .mode = 0644,
3243 .proc_handler = proc_dointvec_jiffies, 3239 .proc_handler = proc_dointvec_jiffies,
3244 .strategy = sysctl_jiffies,
3245 }, 3240 },
3246 { 3241 {
3247 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3248 .procname = "min_pmtu", 3242 .procname = "min_pmtu",
3249 .data = &ip_rt_min_pmtu, 3243 .data = &ip_rt_min_pmtu,
3250 .maxlen = sizeof(int), 3244 .maxlen = sizeof(int),
@@ -3252,7 +3246,6 @@ static ctl_table ipv4_route_table[] = {
3252 .proc_handler = proc_dointvec, 3246 .proc_handler = proc_dointvec,
3253 }, 3247 },
3254 { 3248 {
3255 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3256 .procname = "min_adv_mss", 3249 .procname = "min_adv_mss",
3257 .data = &ip_rt_min_advmss, 3250 .data = &ip_rt_min_advmss,
3258 .maxlen = sizeof(int), 3251 .maxlen = sizeof(int),
@@ -3260,50 +3253,46 @@ static ctl_table ipv4_route_table[] = {
3260 .proc_handler = proc_dointvec, 3253 .proc_handler = proc_dointvec,
3261 }, 3254 },
3262 { 3255 {
3263 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3264 .procname = "secret_interval", 3256 .procname = "secret_interval",
3265 .data = &ip_rt_secret_interval, 3257 .data = &ip_rt_secret_interval,
3266 .maxlen = sizeof(int), 3258 .maxlen = sizeof(int),
3267 .mode = 0644, 3259 .mode = 0644,
3268 .proc_handler = ipv4_sysctl_rt_secret_interval, 3260 .proc_handler = ipv4_sysctl_rt_secret_interval,
3269 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3270 }, 3261 },
3271 { .ctl_name = 0 } 3262 { }
3272}; 3263};
3273 3264
3274static struct ctl_table empty[1]; 3265static struct ctl_table empty[1];
3275 3266
3276static struct ctl_table ipv4_skeleton[] = 3267static struct ctl_table ipv4_skeleton[] =
3277{ 3268{
3278 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, 3269 { .procname = "route",
3279 .mode = 0555, .child = ipv4_route_table}, 3270 .mode = 0555, .child = ipv4_route_table},
3280 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, 3271 { .procname = "neigh",
3281 .mode = 0555, .child = empty}, 3272 .mode = 0555, .child = empty},
3282 { } 3273 { }
3283}; 3274};
3284 3275
3285static __net_initdata struct ctl_path ipv4_path[] = { 3276static __net_initdata struct ctl_path ipv4_path[] = {
3286 { .procname = "net", .ctl_name = CTL_NET, }, 3277 { .procname = "net", },
3287 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3278 { .procname = "ipv4", },
3288 { }, 3279 { },
3289}; 3280};
3290 3281
3291static struct ctl_table ipv4_route_flush_table[] = { 3282static struct ctl_table ipv4_route_flush_table[] = {
3292 { 3283 {
3293 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3294 .procname = "flush", 3284 .procname = "flush",
3295 .maxlen = sizeof(int), 3285 .maxlen = sizeof(int),
3296 .mode = 0200, 3286 .mode = 0200,
3297 .proc_handler = ipv4_sysctl_rtcache_flush, 3287 .proc_handler = ipv4_sysctl_rtcache_flush,
3298 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3299 }, 3288 },
3300 { .ctl_name = 0 }, 3289 { },
3301}; 3290};
3302 3291
3303static __net_initdata struct ctl_path ipv4_route_path[] = { 3292static __net_initdata struct ctl_path ipv4_route_path[] = {
3304 { .procname = "net", .ctl_name = CTL_NET, }, 3293 { .procname = "net", },
3305 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3294 { .procname = "ipv4", },
3306 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, 3295 { .procname = "route", },
3307 { }, 3296 { },
3308}; 3297};
3309 3298
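All of the .ctl_name and .strategy initializers deleted above go away because the binary sys_sysctl interface is being retired; a ctl_table entry now needs only a procname and a proc_handler, and the table terminator shrinks from { .ctl_name = 0 } to an all-zero { }. A minimal sketch of the post-conversion shape (the variable and name are placeholders):

static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,	/* /proc/sys only */
	},
	{ }	/* all-zero terminator */
};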
@@ -3312,7 +3301,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
3312 struct ctl_table *tbl; 3301 struct ctl_table *tbl;
3313 3302
3314 tbl = ipv4_route_flush_table; 3303 tbl = ipv4_route_flush_table;
3315 if (net != &init_net) { 3304 if (!net_eq(net, &init_net)) {
3316 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3317 if (tbl == NULL) 3306 if (tbl == NULL)
3318 goto err_dup; 3307 goto err_dup;
@@ -3380,7 +3369,7 @@ static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3380 3369
3381 3370
3382#ifdef CONFIG_NET_CLS_ROUTE 3371#ifdef CONFIG_NET_CLS_ROUTE
3383struct ip_rt_acct *ip_rt_acct __read_mostly; 3372struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3384#endif /* CONFIG_NET_CLS_ROUTE */ 3373#endif /* CONFIG_NET_CLS_ROUTE */
3385 3374
3386static __initdata unsigned long rhash_entries; 3375static __initdata unsigned long rhash_entries;
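The last route.c hunk adds the __percpu address-space annotation to ip_rt_acct, letting sparse flag direct dereferences of a per-CPU pointer; accesses must go through per_cpu_ptr() and friends, as the rewritten rt_acct_proc_show() above already does. A sketch of the declaration/access pairing, with illustrative names and cpu assumed valid:

static struct ip_rt_acct __percpu *acct;	/* e.g. from __alloc_percpu() */

struct ip_rt_acct *p = per_cpu_ptr(acct, cpu);	/* explicit CPU, as the
						 * accumulation loop does */
p->o_packets++;

/* A bare 'acct->o_packets++' would now earn a sparse warning. */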
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e077ac33..5c24db4a3c91 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp);
253struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 253struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
254 struct ip_options *opt) 254 struct ip_options *opt)
255{ 255{
256 struct tcp_options_received tcp_opt;
257 u8 *hash_location;
256 struct inet_request_sock *ireq; 258 struct inet_request_sock *ireq;
257 struct tcp_request_sock *treq; 259 struct tcp_request_sock *treq;
258 struct tcp_sock *tp = tcp_sk(sk); 260 struct tcp_sock *tp = tcp_sk(sk);
@@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
263 int mss; 265 int mss;
264 struct rtable *rt; 266 struct rtable *rt;
265 __u8 rcv_wscale; 267 __u8 rcv_wscale;
266 struct tcp_options_received tcp_opt;
267 268
268 if (!sysctl_tcp_syncookies || !th->ack) 269 if (!sysctl_tcp_syncookies || !th->ack)
269 goto out; 270 goto out;
@@ -278,7 +279,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
278 279
279 /* check for timestamp cookie support */ 280 /* check for timestamp cookie support */
280 memset(&tcp_opt, 0, sizeof(tcp_opt)); 281 memset(&tcp_opt, 0, sizeof(tcp_opt));
281 tcp_parse_options(skb, &tcp_opt, 0); 282 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
282 283
283 if (tcp_opt.saw_tstamp) 284 if (tcp_opt.saw_tstamp)
284 cookie_check_timestamp(&tcp_opt); 285 cookie_check_timestamp(&tcp_opt);
@@ -333,7 +334,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
333 * no easy way to do this. 334 * no easy way to do this.
334 */ 335 */
335 { 336 {
336 struct flowi fl = { .nl_u = { .ip4_u = 337 struct flowi fl = { .mark = sk->sk_mark,
338 .nl_u = { .ip4_u =
337 { .daddr = ((opt && opt->srr) ? 339 { .daddr = ((opt && opt->srr) ?
338 opt->faddr : 340 opt->faddr :
339 ireq->rmt_addr), 341 ireq->rmt_addr),
@@ -356,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
356 358
357 tcp_select_initial_window(tcp_full_space(sk), req->mss, 359 tcp_select_initial_window(tcp_full_space(sk), req->mss,
358 &req->rcv_wnd, &req->window_clamp, 360 &req->rcv_wnd, &req->window_clamp,
359 ireq->wscale_ok, &rcv_wscale); 361 ireq->wscale_ok, &rcv_wscale,
362 dst_metric(&rt->u.dst, RTAX_INITRWND));
360 363
361 ireq->rcv_wscale = rcv_wscale; 364 ireq->rcv_wscale = rcv_wscale;
362 365
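Two interface changes flow through cookie_v4_check(): tcp_parse_options() grows a u8 **hash_location out-parameter (somewhere to record an option payload location for the TCP MD5/cookie paths; cookie_v4_check() only has to supply it), and tcp_select_initial_window() gains the route's RTAX_INITRWND metric so a per-route initial receive window can cap the computed one. The flowi also picks up sk->sk_mark, so the route lookup honours socket marks. The resulting call pattern, as the hunks above use it:

u8 *hash_location;
struct tcp_options_received tcp_opt;

memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, &hash_location, 0);

tcp_select_initial_window(tcp_full_space(sk), req->mss,
			  &req->rcv_wnd, &req->window_clamp,
			  ireq->wscale_ok, &rcv_wscale,
			  dst_metric(&rt->u.dst, RTAX_INITRWND));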
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2dcf04d9b005..1cd5c15174b8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/inetdevice.h> 12#include <linux/inetdevice.h>
13#include <linux/seqlock.h> 13#include <linux/seqlock.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -63,34 +64,6 @@ static int ipv4_local_port_range(ctl_table *table, int write,
63 return ret; 64 return ret;
64} 65}
65 66
66/* Validate changes from sysctl interface. */
67static int ipv4_sysctl_local_port_range(ctl_table *table,
68 void __user *oldval,
69 size_t __user *oldlenp,
70 void __user *newval, size_t newlen)
71{
72 int ret;
73 int range[2];
74 ctl_table tmp = {
75 .data = &range,
76 .maxlen = sizeof(range),
77 .mode = table->mode,
78 .extra1 = &ip_local_port_range_min,
79 .extra2 = &ip_local_port_range_max,
80 };
81
82 inet_get_local_port_range(range, range + 1);
83 ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen);
84 if (ret == 0 && newval && newlen) {
85 if (range[1] < range[0])
86 ret = -EINVAL;
87 else
88 set_local_port_range(range);
89 }
90 return ret;
91}
92
93
94static int proc_tcp_congestion_control(ctl_table *ctl, int write, 67static int proc_tcp_congestion_control(ctl_table *ctl, int write,
95 void __user *buffer, size_t *lenp, loff_t *ppos) 68 void __user *buffer, size_t *lenp, loff_t *ppos)
96{ 69{
@@ -109,25 +82,6 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write,
109 return ret; 82 return ret;
110} 83}
111 84
112static int sysctl_tcp_congestion_control(ctl_table *table,
113 void __user *oldval,
114 size_t __user *oldlenp,
115 void __user *newval, size_t newlen)
116{
117 char val[TCP_CA_NAME_MAX];
118 ctl_table tbl = {
119 .data = val,
120 .maxlen = TCP_CA_NAME_MAX,
121 };
122 int ret;
123
124 tcp_get_default_congestion_control(val);
125 ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
126 if (ret == 1 && newval && newlen)
127 ret = tcp_set_default_congestion_control(val);
128 return ret;
129}
130
131static int proc_tcp_available_congestion_control(ctl_table *ctl, 85static int proc_tcp_available_congestion_control(ctl_table *ctl,
132 int write, 86 int write,
133 void __user *buffer, size_t *lenp, 87 void __user *buffer, size_t *lenp,
@@ -165,32 +119,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
165 return ret; 119 return ret;
166} 120}
167 121
168static int strategy_allowed_congestion_control(ctl_table *table,
169 void __user *oldval,
170 size_t __user *oldlenp,
171 void __user *newval,
172 size_t newlen)
173{
174 ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
175 int ret;
176
177 tbl.data = kmalloc(tbl.maxlen, GFP_USER);
178 if (!tbl.data)
179 return -ENOMEM;
180
181 tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
182 ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
183 if (ret == 1 && newval && newlen)
184 ret = tcp_set_allowed_congestion_control(tbl.data);
185 kfree(tbl.data);
186
187 return ret;
188
189}
190
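With the binary strategy handlers removed above, validation lives entirely in the proc handlers, which typically copy into a temporary ctl_table so a proc_dointvec-style helper can range-check input before the real variable is touched (ipv4_local_port_range, kept by this patch, works this way). A sketch of that wrapper idiom, with illustrative names and assumed min/max bounds:

static int example_minmax(ctl_table *ctl, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int val = example_value;
	ctl_table tmp = {
		.data	= &val,
		.maxlen	= sizeof(val),
		.mode	= ctl->mode,
		.extra1	= &example_min,	/* lower bound */
		.extra2	= &example_max,	/* upper bound */
	};
	int ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (ret == 0 && write)
		example_value = val;	/* commit only validated input */
	return ret;
}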
191static struct ctl_table ipv4_table[] = { 122static struct ctl_table ipv4_table[] = {
192 { 123 {
193 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
194 .procname = "tcp_timestamps", 124 .procname = "tcp_timestamps",
195 .data = &sysctl_tcp_timestamps, 125 .data = &sysctl_tcp_timestamps,
196 .maxlen = sizeof(int), 126 .maxlen = sizeof(int),
@@ -198,7 +128,6 @@ static struct ctl_table ipv4_table[] = {
198 .proc_handler = proc_dointvec 128 .proc_handler = proc_dointvec
199 }, 129 },
200 { 130 {
201 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
202 .procname = "tcp_window_scaling", 131 .procname = "tcp_window_scaling",
203 .data = &sysctl_tcp_window_scaling, 132 .data = &sysctl_tcp_window_scaling,
204 .maxlen = sizeof(int), 133 .maxlen = sizeof(int),
@@ -206,7 +135,6 @@ static struct ctl_table ipv4_table[] = {
206 .proc_handler = proc_dointvec 135 .proc_handler = proc_dointvec
207 }, 136 },
208 { 137 {
209 .ctl_name = NET_IPV4_TCP_SACK,
210 .procname = "tcp_sack", 138 .procname = "tcp_sack",
211 .data = &sysctl_tcp_sack, 139 .data = &sysctl_tcp_sack,
212 .maxlen = sizeof(int), 140 .maxlen = sizeof(int),
@@ -214,7 +142,6 @@ static struct ctl_table ipv4_table[] = {
214 .proc_handler = proc_dointvec 142 .proc_handler = proc_dointvec
215 }, 143 },
216 { 144 {
217 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
218 .procname = "tcp_retrans_collapse", 145 .procname = "tcp_retrans_collapse",
219 .data = &sysctl_tcp_retrans_collapse, 146 .data = &sysctl_tcp_retrans_collapse,
220 .maxlen = sizeof(int), 147 .maxlen = sizeof(int),
@@ -222,17 +149,14 @@ static struct ctl_table ipv4_table[] = {
222 .proc_handler = proc_dointvec 149 .proc_handler = proc_dointvec
223 }, 150 },
224 { 151 {
225 .ctl_name = NET_IPV4_DEFAULT_TTL,
226 .procname = "ip_default_ttl", 152 .procname = "ip_default_ttl",
227 .data = &sysctl_ip_default_ttl, 153 .data = &sysctl_ip_default_ttl,
228 .maxlen = sizeof(int), 154 .maxlen = sizeof(int),
229 .mode = 0644, 155 .mode = 0644,
230 .proc_handler = ipv4_doint_and_flush, 156 .proc_handler = ipv4_doint_and_flush,
231 .strategy = ipv4_doint_and_flush_strategy,
232 .extra2 = &init_net, 157 .extra2 = &init_net,
233 }, 158 },
234 { 159 {
235 .ctl_name = NET_IPV4_NO_PMTU_DISC,
236 .procname = "ip_no_pmtu_disc", 160 .procname = "ip_no_pmtu_disc",
237 .data = &ipv4_config.no_pmtu_disc, 161 .data = &ipv4_config.no_pmtu_disc,
238 .maxlen = sizeof(int), 162 .maxlen = sizeof(int),
@@ -240,7 +164,6 @@ static struct ctl_table ipv4_table[] = {
240 .proc_handler = proc_dointvec 164 .proc_handler = proc_dointvec
241 }, 165 },
242 { 166 {
243 .ctl_name = NET_IPV4_NONLOCAL_BIND,
244 .procname = "ip_nonlocal_bind", 167 .procname = "ip_nonlocal_bind",
245 .data = &sysctl_ip_nonlocal_bind, 168 .data = &sysctl_ip_nonlocal_bind,
246 .maxlen = sizeof(int), 169 .maxlen = sizeof(int),
@@ -248,7 +171,6 @@ static struct ctl_table ipv4_table[] = {
248 .proc_handler = proc_dointvec 171 .proc_handler = proc_dointvec
249 }, 172 },
250 { 173 {
251 .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
252 .procname = "tcp_syn_retries", 174 .procname = "tcp_syn_retries",
253 .data = &sysctl_tcp_syn_retries, 175 .data = &sysctl_tcp_syn_retries,
254 .maxlen = sizeof(int), 176 .maxlen = sizeof(int),
@@ -256,7 +178,6 @@ static struct ctl_table ipv4_table[] = {
256 .proc_handler = proc_dointvec 178 .proc_handler = proc_dointvec
257 }, 179 },
258 { 180 {
259 .ctl_name = NET_TCP_SYNACK_RETRIES,
260 .procname = "tcp_synack_retries", 181 .procname = "tcp_synack_retries",
261 .data = &sysctl_tcp_synack_retries, 182 .data = &sysctl_tcp_synack_retries,
262 .maxlen = sizeof(int), 183 .maxlen = sizeof(int),
@@ -264,7 +185,6 @@ static struct ctl_table ipv4_table[] = {
264 .proc_handler = proc_dointvec 185 .proc_handler = proc_dointvec
265 }, 186 },
266 { 187 {
267 .ctl_name = NET_TCP_MAX_ORPHANS,
268 .procname = "tcp_max_orphans", 188 .procname = "tcp_max_orphans",
269 .data = &sysctl_tcp_max_orphans, 189 .data = &sysctl_tcp_max_orphans,
270 .maxlen = sizeof(int), 190 .maxlen = sizeof(int),
@@ -272,7 +192,6 @@ static struct ctl_table ipv4_table[] = {
272 .proc_handler = proc_dointvec 192 .proc_handler = proc_dointvec
273 }, 193 },
274 { 194 {
275 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
276 .procname = "tcp_max_tw_buckets", 195 .procname = "tcp_max_tw_buckets",
277 .data = &tcp_death_row.sysctl_max_tw_buckets, 196 .data = &tcp_death_row.sysctl_max_tw_buckets,
278 .maxlen = sizeof(int), 197 .maxlen = sizeof(int),
@@ -280,7 +199,6 @@ static struct ctl_table ipv4_table[] = {
280 .proc_handler = proc_dointvec 199 .proc_handler = proc_dointvec
281 }, 200 },
282 { 201 {
283 .ctl_name = NET_IPV4_DYNADDR,
284 .procname = "ip_dynaddr", 202 .procname = "ip_dynaddr",
285 .data = &sysctl_ip_dynaddr, 203 .data = &sysctl_ip_dynaddr,
286 .maxlen = sizeof(int), 204 .maxlen = sizeof(int),
@@ -288,16 +206,13 @@ static struct ctl_table ipv4_table[] = {
288 .proc_handler = proc_dointvec 206 .proc_handler = proc_dointvec
289 }, 207 },
290 { 208 {
291 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
292 .procname = "tcp_keepalive_time", 209 .procname = "tcp_keepalive_time",
293 .data = &sysctl_tcp_keepalive_time, 210 .data = &sysctl_tcp_keepalive_time,
294 .maxlen = sizeof(int), 211 .maxlen = sizeof(int),
295 .mode = 0644, 212 .mode = 0644,
296 .proc_handler = proc_dointvec_jiffies, 213 .proc_handler = proc_dointvec_jiffies,
297 .strategy = sysctl_jiffies
298 }, 214 },
299 { 215 {
300 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
301 .procname = "tcp_keepalive_probes", 216 .procname = "tcp_keepalive_probes",
302 .data = &sysctl_tcp_keepalive_probes, 217 .data = &sysctl_tcp_keepalive_probes,
303 .maxlen = sizeof(int), 218 .maxlen = sizeof(int),
@@ -305,26 +220,21 @@ static struct ctl_table ipv4_table[] = {
305 .proc_handler = proc_dointvec 220 .proc_handler = proc_dointvec
306 }, 221 },
307 { 222 {
308 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
309 .procname = "tcp_keepalive_intvl", 223 .procname = "tcp_keepalive_intvl",
310 .data = &sysctl_tcp_keepalive_intvl, 224 .data = &sysctl_tcp_keepalive_intvl,
311 .maxlen = sizeof(int), 225 .maxlen = sizeof(int),
312 .mode = 0644, 226 .mode = 0644,
313 .proc_handler = proc_dointvec_jiffies, 227 .proc_handler = proc_dointvec_jiffies,
314 .strategy = sysctl_jiffies
315 }, 228 },
316 { 229 {
317 .ctl_name = NET_IPV4_TCP_RETRIES1,
318 .procname = "tcp_retries1", 230 .procname = "tcp_retries1",
319 .data = &sysctl_tcp_retries1, 231 .data = &sysctl_tcp_retries1,
320 .maxlen = sizeof(int), 232 .maxlen = sizeof(int),
321 .mode = 0644, 233 .mode = 0644,
322 .proc_handler = proc_dointvec_minmax, 234 .proc_handler = proc_dointvec_minmax,
323 .strategy = sysctl_intvec,
324 .extra2 = &tcp_retr1_max 235 .extra2 = &tcp_retr1_max
325 }, 236 },
326 { 237 {
327 .ctl_name = NET_IPV4_TCP_RETRIES2,
328 .procname = "tcp_retries2", 238 .procname = "tcp_retries2",
329 .data = &sysctl_tcp_retries2, 239 .data = &sysctl_tcp_retries2,
330 .maxlen = sizeof(int), 240 .maxlen = sizeof(int),
@@ -332,17 +242,14 @@ static struct ctl_table ipv4_table[] = {
332 .proc_handler = proc_dointvec 242 .proc_handler = proc_dointvec
333 }, 243 },
334 { 244 {
335 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
336 .procname = "tcp_fin_timeout", 245 .procname = "tcp_fin_timeout",
337 .data = &sysctl_tcp_fin_timeout, 246 .data = &sysctl_tcp_fin_timeout,
338 .maxlen = sizeof(int), 247 .maxlen = sizeof(int),
339 .mode = 0644, 248 .mode = 0644,
340 .proc_handler = proc_dointvec_jiffies, 249 .proc_handler = proc_dointvec_jiffies,
341 .strategy = sysctl_jiffies
342 }, 250 },
343#ifdef CONFIG_SYN_COOKIES 251#ifdef CONFIG_SYN_COOKIES
344 { 252 {
345 .ctl_name = NET_TCP_SYNCOOKIES,
346 .procname = "tcp_syncookies", 253 .procname = "tcp_syncookies",
347 .data = &sysctl_tcp_syncookies, 254 .data = &sysctl_tcp_syncookies,
348 .maxlen = sizeof(int), 255 .maxlen = sizeof(int),
@@ -351,7 +258,6 @@ static struct ctl_table ipv4_table[] = {
351 }, 258 },
352#endif 259#endif
353 { 260 {
354 .ctl_name = NET_TCP_TW_RECYCLE,
355 .procname = "tcp_tw_recycle", 261 .procname = "tcp_tw_recycle",
356 .data = &tcp_death_row.sysctl_tw_recycle, 262 .data = &tcp_death_row.sysctl_tw_recycle,
357 .maxlen = sizeof(int), 263 .maxlen = sizeof(int),
@@ -359,7 +265,6 @@ static struct ctl_table ipv4_table[] = {
359 .proc_handler = proc_dointvec 265 .proc_handler = proc_dointvec
360 }, 266 },
361 { 267 {
362 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
363 .procname = "tcp_abort_on_overflow", 268 .procname = "tcp_abort_on_overflow",
364 .data = &sysctl_tcp_abort_on_overflow, 269 .data = &sysctl_tcp_abort_on_overflow,
365 .maxlen = sizeof(int), 270 .maxlen = sizeof(int),
@@ -367,7 +272,6 @@ static struct ctl_table ipv4_table[] = {
367 .proc_handler = proc_dointvec 272 .proc_handler = proc_dointvec
368 }, 273 },
369 { 274 {
370 .ctl_name = NET_TCP_STDURG,
371 .procname = "tcp_stdurg", 275 .procname = "tcp_stdurg",
372 .data = &sysctl_tcp_stdurg, 276 .data = &sysctl_tcp_stdurg,
373 .maxlen = sizeof(int), 277 .maxlen = sizeof(int),
@@ -375,7 +279,6 @@ static struct ctl_table ipv4_table[] = {
375 .proc_handler = proc_dointvec 279 .proc_handler = proc_dointvec
376 }, 280 },
377 { 281 {
378 .ctl_name = NET_TCP_RFC1337,
379 .procname = "tcp_rfc1337", 282 .procname = "tcp_rfc1337",
380 .data = &sysctl_tcp_rfc1337, 283 .data = &sysctl_tcp_rfc1337,
381 .maxlen = sizeof(int), 284 .maxlen = sizeof(int),
@@ -383,7 +286,6 @@ static struct ctl_table ipv4_table[] = {
383 .proc_handler = proc_dointvec 286 .proc_handler = proc_dointvec
384 }, 287 },
385 { 288 {
386 .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
387 .procname = "tcp_max_syn_backlog", 289 .procname = "tcp_max_syn_backlog",
388 .data = &sysctl_max_syn_backlog, 290 .data = &sysctl_max_syn_backlog,
389 .maxlen = sizeof(int), 291 .maxlen = sizeof(int),
@@ -391,17 +293,14 @@ static struct ctl_table ipv4_table[] = {
391 .proc_handler = proc_dointvec 293 .proc_handler = proc_dointvec
392 }, 294 },
393 { 295 {
394 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
395 .procname = "ip_local_port_range", 296 .procname = "ip_local_port_range",
396 .data = &sysctl_local_ports.range, 297 .data = &sysctl_local_ports.range,
397 .maxlen = sizeof(sysctl_local_ports.range), 298 .maxlen = sizeof(sysctl_local_ports.range),
398 .mode = 0644, 299 .mode = 0644,
399 .proc_handler = ipv4_local_port_range, 300 .proc_handler = ipv4_local_port_range,
400 .strategy = ipv4_sysctl_local_port_range,
401 }, 301 },
402#ifdef CONFIG_IP_MULTICAST 302#ifdef CONFIG_IP_MULTICAST
403 { 303 {
404 .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
405 .procname = "igmp_max_memberships", 304 .procname = "igmp_max_memberships",
406 .data = &sysctl_igmp_max_memberships, 305 .data = &sysctl_igmp_max_memberships,
407 .maxlen = sizeof(int), 306 .maxlen = sizeof(int),
@@ -411,7 +310,6 @@ static struct ctl_table ipv4_table[] = {
411 310
412#endif 311#endif
413 { 312 {
414 .ctl_name = NET_IPV4_IGMP_MAX_MSF,
415 .procname = "igmp_max_msf", 313 .procname = "igmp_max_msf",
416 .data = &sysctl_igmp_max_msf, 314 .data = &sysctl_igmp_max_msf,
417 .maxlen = sizeof(int), 315 .maxlen = sizeof(int),
@@ -419,7 +317,6 @@ static struct ctl_table ipv4_table[] = {
419 .proc_handler = proc_dointvec 317 .proc_handler = proc_dointvec
420 }, 318 },
421 { 319 {
422 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
423 .procname = "inet_peer_threshold", 320 .procname = "inet_peer_threshold",
424 .data = &inet_peer_threshold, 321 .data = &inet_peer_threshold,
425 .maxlen = sizeof(int), 322 .maxlen = sizeof(int),
@@ -427,43 +324,34 @@ static struct ctl_table ipv4_table[] = {
427 .proc_handler = proc_dointvec 324 .proc_handler = proc_dointvec
428 }, 325 },
429 { 326 {
430 .ctl_name = NET_IPV4_INET_PEER_MINTTL,
431 .procname = "inet_peer_minttl", 327 .procname = "inet_peer_minttl",
432 .data = &inet_peer_minttl, 328 .data = &inet_peer_minttl,
433 .maxlen = sizeof(int), 329 .maxlen = sizeof(int),
434 .mode = 0644, 330 .mode = 0644,
435 .proc_handler = proc_dointvec_jiffies, 331 .proc_handler = proc_dointvec_jiffies,
436 .strategy = sysctl_jiffies
437 }, 332 },
438 { 333 {
439 .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
440 .procname = "inet_peer_maxttl", 334 .procname = "inet_peer_maxttl",
441 .data = &inet_peer_maxttl, 335 .data = &inet_peer_maxttl,
442 .maxlen = sizeof(int), 336 .maxlen = sizeof(int),
443 .mode = 0644, 337 .mode = 0644,
444 .proc_handler = proc_dointvec_jiffies, 338 .proc_handler = proc_dointvec_jiffies,
445 .strategy = sysctl_jiffies
446 }, 339 },
447 { 340 {
448 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
449 .procname = "inet_peer_gc_mintime", 341 .procname = "inet_peer_gc_mintime",
450 .data = &inet_peer_gc_mintime, 342 .data = &inet_peer_gc_mintime,
451 .maxlen = sizeof(int), 343 .maxlen = sizeof(int),
452 .mode = 0644, 344 .mode = 0644,
453 .proc_handler = proc_dointvec_jiffies, 345 .proc_handler = proc_dointvec_jiffies,
454 .strategy = sysctl_jiffies
455 }, 346 },
456 { 347 {
457 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
458 .procname = "inet_peer_gc_maxtime", 348 .procname = "inet_peer_gc_maxtime",
459 .data = &inet_peer_gc_maxtime, 349 .data = &inet_peer_gc_maxtime,
460 .maxlen = sizeof(int), 350 .maxlen = sizeof(int),
461 .mode = 0644, 351 .mode = 0644,
462 .proc_handler = proc_dointvec_jiffies, 352 .proc_handler = proc_dointvec_jiffies,
463 .strategy = sysctl_jiffies
464 }, 353 },
465 { 354 {
466 .ctl_name = NET_TCP_ORPHAN_RETRIES,
467 .procname = "tcp_orphan_retries", 355 .procname = "tcp_orphan_retries",
468 .data = &sysctl_tcp_orphan_retries, 356 .data = &sysctl_tcp_orphan_retries,
469 .maxlen = sizeof(int), 357 .maxlen = sizeof(int),
@@ -471,7 +359,6 @@ static struct ctl_table ipv4_table[] = {
471 .proc_handler = proc_dointvec 359 .proc_handler = proc_dointvec
472 }, 360 },
473 { 361 {
474 .ctl_name = NET_TCP_FACK,
475 .procname = "tcp_fack", 362 .procname = "tcp_fack",
476 .data = &sysctl_tcp_fack, 363 .data = &sysctl_tcp_fack,
477 .maxlen = sizeof(int), 364 .maxlen = sizeof(int),
@@ -479,7 +366,6 @@ static struct ctl_table ipv4_table[] = {
479 .proc_handler = proc_dointvec 366 .proc_handler = proc_dointvec
480 }, 367 },
481 { 368 {
482 .ctl_name = NET_TCP_REORDERING,
483 .procname = "tcp_reordering", 369 .procname = "tcp_reordering",
484 .data = &sysctl_tcp_reordering, 370 .data = &sysctl_tcp_reordering,
485 .maxlen = sizeof(int), 371 .maxlen = sizeof(int),
@@ -487,7 +373,6 @@ static struct ctl_table ipv4_table[] = {
487 .proc_handler = proc_dointvec 373 .proc_handler = proc_dointvec
488 }, 374 },
489 { 375 {
490 .ctl_name = NET_TCP_ECN,
491 .procname = "tcp_ecn", 376 .procname = "tcp_ecn",
492 .data = &sysctl_tcp_ecn, 377 .data = &sysctl_tcp_ecn,
493 .maxlen = sizeof(int), 378 .maxlen = sizeof(int),
@@ -495,7 +380,6 @@ static struct ctl_table ipv4_table[] = {
495 .proc_handler = proc_dointvec 380 .proc_handler = proc_dointvec
496 }, 381 },
497 { 382 {
498 .ctl_name = NET_TCP_DSACK,
499 .procname = "tcp_dsack", 383 .procname = "tcp_dsack",
500 .data = &sysctl_tcp_dsack, 384 .data = &sysctl_tcp_dsack,
501 .maxlen = sizeof(int), 385 .maxlen = sizeof(int),
@@ -503,7 +387,6 @@ static struct ctl_table ipv4_table[] = {
503 .proc_handler = proc_dointvec 387 .proc_handler = proc_dointvec
504 }, 388 },
505 { 389 {
506 .ctl_name = NET_TCP_MEM,
507 .procname = "tcp_mem", 390 .procname = "tcp_mem",
508 .data = &sysctl_tcp_mem, 391 .data = &sysctl_tcp_mem,
509 .maxlen = sizeof(sysctl_tcp_mem), 392 .maxlen = sizeof(sysctl_tcp_mem),
@@ -511,7 +394,6 @@ static struct ctl_table ipv4_table[] = {
511 .proc_handler = proc_dointvec 394 .proc_handler = proc_dointvec
512 }, 395 },
513 { 396 {
514 .ctl_name = NET_TCP_WMEM,
515 .procname = "tcp_wmem", 397 .procname = "tcp_wmem",
516 .data = &sysctl_tcp_wmem, 398 .data = &sysctl_tcp_wmem,
517 .maxlen = sizeof(sysctl_tcp_wmem), 399 .maxlen = sizeof(sysctl_tcp_wmem),
@@ -519,7 +401,6 @@ static struct ctl_table ipv4_table[] = {
519 .proc_handler = proc_dointvec 401 .proc_handler = proc_dointvec
520 }, 402 },
521 { 403 {
522 .ctl_name = NET_TCP_RMEM,
523 .procname = "tcp_rmem", 404 .procname = "tcp_rmem",
524 .data = &sysctl_tcp_rmem, 405 .data = &sysctl_tcp_rmem,
525 .maxlen = sizeof(sysctl_tcp_rmem), 406 .maxlen = sizeof(sysctl_tcp_rmem),
@@ -527,7 +408,6 @@ static struct ctl_table ipv4_table[] = {
527 .proc_handler = proc_dointvec 408 .proc_handler = proc_dointvec
528 }, 409 },
529 { 410 {
530 .ctl_name = NET_TCP_APP_WIN,
531 .procname = "tcp_app_win", 411 .procname = "tcp_app_win",
532 .data = &sysctl_tcp_app_win, 412 .data = &sysctl_tcp_app_win,
533 .maxlen = sizeof(int), 413 .maxlen = sizeof(int),
@@ -535,7 +415,6 @@ static struct ctl_table ipv4_table[] = {
535 .proc_handler = proc_dointvec 415 .proc_handler = proc_dointvec
536 }, 416 },
537 { 417 {
538 .ctl_name = NET_TCP_ADV_WIN_SCALE,
539 .procname = "tcp_adv_win_scale", 418 .procname = "tcp_adv_win_scale",
540 .data = &sysctl_tcp_adv_win_scale, 419 .data = &sysctl_tcp_adv_win_scale,
541 .maxlen = sizeof(int), 420 .maxlen = sizeof(int),
@@ -543,7 +422,6 @@ static struct ctl_table ipv4_table[] = {
543 .proc_handler = proc_dointvec 422 .proc_handler = proc_dointvec
544 }, 423 },
545 { 424 {
546 .ctl_name = NET_TCP_TW_REUSE,
547 .procname = "tcp_tw_reuse", 425 .procname = "tcp_tw_reuse",
548 .data = &sysctl_tcp_tw_reuse, 426 .data = &sysctl_tcp_tw_reuse,
549 .maxlen = sizeof(int), 427 .maxlen = sizeof(int),
@@ -551,7 +429,6 @@ static struct ctl_table ipv4_table[] = {
551 .proc_handler = proc_dointvec 429 .proc_handler = proc_dointvec
552 }, 430 },
553 { 431 {
554 .ctl_name = NET_TCP_FRTO,
555 .procname = "tcp_frto", 432 .procname = "tcp_frto",
556 .data = &sysctl_tcp_frto, 433 .data = &sysctl_tcp_frto,
557 .maxlen = sizeof(int), 434 .maxlen = sizeof(int),
@@ -559,7 +436,6 @@ static struct ctl_table ipv4_table[] = {
559 .proc_handler = proc_dointvec 436 .proc_handler = proc_dointvec
560 }, 437 },
561 { 438 {
562 .ctl_name = NET_TCP_FRTO_RESPONSE,
563 .procname = "tcp_frto_response", 439 .procname = "tcp_frto_response",
564 .data = &sysctl_tcp_frto_response, 440 .data = &sysctl_tcp_frto_response,
565 .maxlen = sizeof(int), 441 .maxlen = sizeof(int),
@@ -567,7 +443,6 @@ static struct ctl_table ipv4_table[] = {
567 .proc_handler = proc_dointvec 443 .proc_handler = proc_dointvec
568 }, 444 },
569 { 445 {
570 .ctl_name = NET_TCP_LOW_LATENCY,
571 .procname = "tcp_low_latency", 446 .procname = "tcp_low_latency",
572 .data = &sysctl_tcp_low_latency, 447 .data = &sysctl_tcp_low_latency,
573 .maxlen = sizeof(int), 448 .maxlen = sizeof(int),
@@ -575,7 +450,6 @@ static struct ctl_table ipv4_table[] = {
575 .proc_handler = proc_dointvec 450 .proc_handler = proc_dointvec
576 }, 451 },
577 { 452 {
578 .ctl_name = NET_TCP_NO_METRICS_SAVE,
579 .procname = "tcp_no_metrics_save", 453 .procname = "tcp_no_metrics_save",
580 .data = &sysctl_tcp_nometrics_save, 454 .data = &sysctl_tcp_nometrics_save,
581 .maxlen = sizeof(int), 455 .maxlen = sizeof(int),
@@ -583,7 +457,6 @@ static struct ctl_table ipv4_table[] = {
583 .proc_handler = proc_dointvec, 457 .proc_handler = proc_dointvec,
584 }, 458 },
585 { 459 {
586 .ctl_name = NET_TCP_MODERATE_RCVBUF,
587 .procname = "tcp_moderate_rcvbuf", 460 .procname = "tcp_moderate_rcvbuf",
588 .data = &sysctl_tcp_moderate_rcvbuf, 461 .data = &sysctl_tcp_moderate_rcvbuf,
589 .maxlen = sizeof(int), 462 .maxlen = sizeof(int),
@@ -591,7 +464,6 @@ static struct ctl_table ipv4_table[] = {
591 .proc_handler = proc_dointvec, 464 .proc_handler = proc_dointvec,
592 }, 465 },
593 { 466 {
594 .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
595 .procname = "tcp_tso_win_divisor", 467 .procname = "tcp_tso_win_divisor",
596 .data = &sysctl_tcp_tso_win_divisor, 468 .data = &sysctl_tcp_tso_win_divisor,
597 .maxlen = sizeof(int), 469 .maxlen = sizeof(int),
@@ -599,15 +471,12 @@ static struct ctl_table ipv4_table[] = {
599 .proc_handler = proc_dointvec, 471 .proc_handler = proc_dointvec,
600 }, 472 },
601 { 473 {
602 .ctl_name = NET_TCP_CONG_CONTROL,
603 .procname = "tcp_congestion_control", 474 .procname = "tcp_congestion_control",
604 .mode = 0644, 475 .mode = 0644,
605 .maxlen = TCP_CA_NAME_MAX, 476 .maxlen = TCP_CA_NAME_MAX,
606 .proc_handler = proc_tcp_congestion_control, 477 .proc_handler = proc_tcp_congestion_control,
607 .strategy = sysctl_tcp_congestion_control,
608 }, 478 },
609 { 479 {
610 .ctl_name = NET_TCP_ABC,
611 .procname = "tcp_abc", 480 .procname = "tcp_abc",
612 .data = &sysctl_tcp_abc, 481 .data = &sysctl_tcp_abc,
613 .maxlen = sizeof(int), 482 .maxlen = sizeof(int),
@@ -615,7 +484,6 @@ static struct ctl_table ipv4_table[] = {
615 .proc_handler = proc_dointvec, 484 .proc_handler = proc_dointvec,
616 }, 485 },
617 { 486 {
618 .ctl_name = NET_TCP_MTU_PROBING,
619 .procname = "tcp_mtu_probing", 487 .procname = "tcp_mtu_probing",
620 .data = &sysctl_tcp_mtu_probing, 488 .data = &sysctl_tcp_mtu_probing,
621 .maxlen = sizeof(int), 489 .maxlen = sizeof(int),
@@ -623,7 +491,6 @@ static struct ctl_table ipv4_table[] = {
623 .proc_handler = proc_dointvec, 491 .proc_handler = proc_dointvec,
624 }, 492 },
625 { 493 {
626 .ctl_name = NET_TCP_BASE_MSS,
627 .procname = "tcp_base_mss", 494 .procname = "tcp_base_mss",
628 .data = &sysctl_tcp_base_mss, 495 .data = &sysctl_tcp_base_mss,
629 .maxlen = sizeof(int), 496 .maxlen = sizeof(int),
@@ -631,7 +498,6 @@ static struct ctl_table ipv4_table[] = {
631 .proc_handler = proc_dointvec, 498 .proc_handler = proc_dointvec,
632 }, 499 },
633 { 500 {
634 .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
635 .procname = "tcp_workaround_signed_windows", 501 .procname = "tcp_workaround_signed_windows",
636 .data = &sysctl_tcp_workaround_signed_windows, 502 .data = &sysctl_tcp_workaround_signed_windows,
637 .maxlen = sizeof(int), 503 .maxlen = sizeof(int),
@@ -640,7 +506,6 @@ static struct ctl_table ipv4_table[] = {
640 }, 506 },
641#ifdef CONFIG_NET_DMA 507#ifdef CONFIG_NET_DMA
642 { 508 {
643 .ctl_name = NET_TCP_DMA_COPYBREAK,
644 .procname = "tcp_dma_copybreak", 509 .procname = "tcp_dma_copybreak",
645 .data = &sysctl_tcp_dma_copybreak, 510 .data = &sysctl_tcp_dma_copybreak,
646 .maxlen = sizeof(int), 511 .maxlen = sizeof(int),
@@ -649,7 +514,6 @@ static struct ctl_table ipv4_table[] = {
649 }, 514 },
650#endif 515#endif
651 { 516 {
652 .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
653 .procname = "tcp_slow_start_after_idle", 517 .procname = "tcp_slow_start_after_idle",
654 .data = &sysctl_tcp_slow_start_after_idle, 518 .data = &sysctl_tcp_slow_start_after_idle,
655 .maxlen = sizeof(int), 519 .maxlen = sizeof(int),
@@ -658,7 +522,6 @@ static struct ctl_table ipv4_table[] = {
658 }, 522 },
659#ifdef CONFIG_NETLABEL 523#ifdef CONFIG_NETLABEL
660 { 524 {
661 .ctl_name = NET_CIPSOV4_CACHE_ENABLE,
662 .procname = "cipso_cache_enable", 525 .procname = "cipso_cache_enable",
663 .data = &cipso_v4_cache_enabled, 526 .data = &cipso_v4_cache_enabled,
664 .maxlen = sizeof(int), 527 .maxlen = sizeof(int),
@@ -666,7 +529,6 @@ static struct ctl_table ipv4_table[] = {
666 .proc_handler = proc_dointvec, 529 .proc_handler = proc_dointvec,
667 }, 530 },
668 { 531 {
669 .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE,
670 .procname = "cipso_cache_bucket_size", 532 .procname = "cipso_cache_bucket_size",
671 .data = &cipso_v4_cache_bucketsize, 533 .data = &cipso_v4_cache_bucketsize,
672 .maxlen = sizeof(int), 534 .maxlen = sizeof(int),
@@ -674,7 +536,6 @@ static struct ctl_table ipv4_table[] = {
674 .proc_handler = proc_dointvec, 536 .proc_handler = proc_dointvec,
675 }, 537 },
676 { 538 {
677 .ctl_name = NET_CIPSOV4_RBM_OPTFMT,
678 .procname = "cipso_rbm_optfmt", 539 .procname = "cipso_rbm_optfmt",
679 .data = &cipso_v4_rbm_optfmt, 540 .data = &cipso_v4_rbm_optfmt,
680 .maxlen = sizeof(int), 541 .maxlen = sizeof(int),
@@ -682,7 +543,6 @@ static struct ctl_table ipv4_table[] = {
682 .proc_handler = proc_dointvec, 543 .proc_handler = proc_dointvec,
683 }, 544 },
684 { 545 {
685 .ctl_name = NET_CIPSOV4_RBM_STRICTVALID,
686 .procname = "cipso_rbm_strictvalid", 546 .procname = "cipso_rbm_strictvalid",
687 .data = &cipso_v4_rbm_strictvalid, 547 .data = &cipso_v4_rbm_strictvalid,
688 .maxlen = sizeof(int), 548 .maxlen = sizeof(int),
@@ -697,15 +557,12 @@ static struct ctl_table ipv4_table[] = {
697 .proc_handler = proc_tcp_available_congestion_control, 557 .proc_handler = proc_tcp_available_congestion_control,
698 }, 558 },
699 { 559 {
700 .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL,
701 .procname = "tcp_allowed_congestion_control", 560 .procname = "tcp_allowed_congestion_control",
702 .maxlen = TCP_CA_BUF_MAX, 561 .maxlen = TCP_CA_BUF_MAX,
703 .mode = 0644, 562 .mode = 0644,
704 .proc_handler = proc_allowed_congestion_control, 563 .proc_handler = proc_allowed_congestion_control,
705 .strategy = strategy_allowed_congestion_control,
706 }, 564 },
707 { 565 {
708 .ctl_name = NET_TCP_MAX_SSTHRESH,
709 .procname = "tcp_max_ssthresh", 566 .procname = "tcp_max_ssthresh",
710 .data = &sysctl_tcp_max_ssthresh, 567 .data = &sysctl_tcp_max_ssthresh,
711 .maxlen = sizeof(int), 568 .maxlen = sizeof(int),
@@ -713,41 +570,55 @@ static struct ctl_table ipv4_table[] = {
713 .proc_handler = proc_dointvec, 570 .proc_handler = proc_dointvec,
714 }, 571 },
715 { 572 {
716 .ctl_name = CTL_UNNUMBERED, 573 .procname = "tcp_cookie_size",
574 .data = &sysctl_tcp_cookie_size,
575 .maxlen = sizeof(int),
576 .mode = 0644,
577 .proc_handler = proc_dointvec
578 },
579 {
580 .procname = "tcp_thin_linear_timeouts",
581 .data = &sysctl_tcp_thin_linear_timeouts,
582 .maxlen = sizeof(int),
583 .mode = 0644,
584 .proc_handler = proc_dointvec
585 },
586 {
587 .procname = "tcp_thin_dupack",
588 .data = &sysctl_tcp_thin_dupack,
589 .maxlen = sizeof(int),
590 .mode = 0644,
591 .proc_handler = proc_dointvec
592 },
593 {
717 .procname = "udp_mem", 594 .procname = "udp_mem",
718 .data = &sysctl_udp_mem, 595 .data = &sysctl_udp_mem,
719 .maxlen = sizeof(sysctl_udp_mem), 596 .maxlen = sizeof(sysctl_udp_mem),
720 .mode = 0644, 597 .mode = 0644,
721 .proc_handler = proc_dointvec_minmax, 598 .proc_handler = proc_dointvec_minmax,
722 .strategy = sysctl_intvec,
723 .extra1 = &zero 599 .extra1 = &zero
724 }, 600 },
725 { 601 {
726 .ctl_name = CTL_UNNUMBERED,
727 .procname = "udp_rmem_min", 602 .procname = "udp_rmem_min",
728 .data = &sysctl_udp_rmem_min, 603 .data = &sysctl_udp_rmem_min,
729 .maxlen = sizeof(sysctl_udp_rmem_min), 604 .maxlen = sizeof(sysctl_udp_rmem_min),
730 .mode = 0644, 605 .mode = 0644,
731 .proc_handler = proc_dointvec_minmax, 606 .proc_handler = proc_dointvec_minmax,
732 .strategy = sysctl_intvec,
733 .extra1 = &zero 607 .extra1 = &zero
734 }, 608 },
735 { 609 {
736 .ctl_name = CTL_UNNUMBERED,
737 .procname = "udp_wmem_min", 610 .procname = "udp_wmem_min",
738 .data = &sysctl_udp_wmem_min, 611 .data = &sysctl_udp_wmem_min,
739 .maxlen = sizeof(sysctl_udp_wmem_min), 612 .maxlen = sizeof(sysctl_udp_wmem_min),
740 .mode = 0644, 613 .mode = 0644,
741 .proc_handler = proc_dointvec_minmax, 614 .proc_handler = proc_dointvec_minmax,
742 .strategy = sysctl_intvec,
743 .extra1 = &zero 615 .extra1 = &zero
744 }, 616 },
745 { .ctl_name = 0 } 617 { }
746}; 618};
747 619
748static struct ctl_table ipv4_net_table[] = { 620static struct ctl_table ipv4_net_table[] = {
749 { 621 {
750 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
751 .procname = "icmp_echo_ignore_all", 622 .procname = "icmp_echo_ignore_all",
752 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, 623 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
753 .maxlen = sizeof(int), 624 .maxlen = sizeof(int),
@@ -755,7 +626,6 @@ static struct ctl_table ipv4_net_table[] = {
755 .proc_handler = proc_dointvec 626 .proc_handler = proc_dointvec
756 }, 627 },
757 { 628 {
758 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
759 .procname = "icmp_echo_ignore_broadcasts", 629 .procname = "icmp_echo_ignore_broadcasts",
760 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, 630 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
761 .maxlen = sizeof(int), 631 .maxlen = sizeof(int),
@@ -763,7 +633,6 @@ static struct ctl_table ipv4_net_table[] = {
763 .proc_handler = proc_dointvec 633 .proc_handler = proc_dointvec
764 }, 634 },
765 { 635 {
766 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
767 .procname = "icmp_ignore_bogus_error_responses", 636 .procname = "icmp_ignore_bogus_error_responses",
768 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, 637 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
769 .maxlen = sizeof(int), 638 .maxlen = sizeof(int),
@@ -771,7 +640,6 @@ static struct ctl_table ipv4_net_table[] = {
771 .proc_handler = proc_dointvec 640 .proc_handler = proc_dointvec
772 }, 641 },
773 { 642 {
774 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
775 .procname = "icmp_errors_use_inbound_ifaddr", 643 .procname = "icmp_errors_use_inbound_ifaddr",
776 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, 644 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
777 .maxlen = sizeof(int), 645 .maxlen = sizeof(int),
@@ -779,16 +647,13 @@ static struct ctl_table ipv4_net_table[] = {
779 .proc_handler = proc_dointvec 647 .proc_handler = proc_dointvec
780 }, 648 },
781 { 649 {
782 .ctl_name = NET_IPV4_ICMP_RATELIMIT,
783 .procname = "icmp_ratelimit", 650 .procname = "icmp_ratelimit",
784 .data = &init_net.ipv4.sysctl_icmp_ratelimit, 651 .data = &init_net.ipv4.sysctl_icmp_ratelimit,
785 .maxlen = sizeof(int), 652 .maxlen = sizeof(int),
786 .mode = 0644, 653 .mode = 0644,
787 .proc_handler = proc_dointvec_ms_jiffies, 654 .proc_handler = proc_dointvec_ms_jiffies,
788 .strategy = sysctl_ms_jiffies
789 }, 655 },
790 { 656 {
791 .ctl_name = NET_IPV4_ICMP_RATEMASK,
792 .procname = "icmp_ratemask", 657 .procname = "icmp_ratemask",
793 .data = &init_net.ipv4.sysctl_icmp_ratemask, 658 .data = &init_net.ipv4.sysctl_icmp_ratemask,
794 .maxlen = sizeof(int), 659 .maxlen = sizeof(int),
@@ -796,7 +661,6 @@ static struct ctl_table ipv4_net_table[] = {
796 .proc_handler = proc_dointvec 661 .proc_handler = proc_dointvec
797 }, 662 },
798 { 663 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "rt_cache_rebuild_count", 664 .procname = "rt_cache_rebuild_count",
801 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, 665 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
802 .maxlen = sizeof(int), 666 .maxlen = sizeof(int),
@@ -807,8 +671,8 @@ static struct ctl_table ipv4_net_table[] = {
807}; 671};
808 672
809struct ctl_path net_ipv4_ctl_path[] = { 673struct ctl_path net_ipv4_ctl_path[] = {
810 { .procname = "net", .ctl_name = CTL_NET, }, 674 { .procname = "net", },
811 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 675 { .procname = "ipv4", },
812 { }, 676 { },
813}; 677};
814EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); 678EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
@@ -818,7 +682,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
818 struct ctl_table *table; 682 struct ctl_table *table;
819 683
820 table = ipv4_net_table; 684 table = ipv4_net_table;
821 if (net != &init_net) { 685 if (!net_eq(net, &init_net)) {
822 table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); 686 table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
823 if (table == NULL) 687 if (table == NULL)
824 goto err_alloc; 688 goto err_alloc;
@@ -849,7 +713,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
849 return 0; 713 return 0;
850 714
851err_reg: 715err_reg:
852 if (net != &init_net) 716 if (!net_eq(net, &init_net))
853 kfree(table); 717 kfree(table);
854err_alloc: 718err_alloc:
855 return -ENOMEM; 719 return -ENOMEM;
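Taken together, the hunks above strip every .ctl_name and .strategy member from the ipv4 tables: with the binary sys_sysctl interface being retired, an entry needs only a procname, data pointer, mode and proc handler, and the array terminator becomes an empty { } instead of { .ctl_name = 0 }. A minimal sketch of the resulting registration pattern, assuming the 2.6.33-era sysctl API (the example_* names are illustrative, not part of this diff):

#include <linux/slab.h>
#include <linux/sysctl.h>
#include <net/ip.h>		/* net_ipv4_ctl_path */
#include <net/net_namespace.h>

static struct ctl_table example_ipv4_table[] = {
	{
		/* procname-only entry: no .ctl_name, no .strategy */
		.procname	= "icmp_echo_ignore_all",
		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_all,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* empty terminator */
};

static int __net_init example_sysctl_init(struct net *net)
{
	struct ctl_table *table = example_ipv4_table;

	/* non-init namespaces get a private copy, as in the hunk above */
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(example_ipv4_table), GFP_KERNEL);
		if (table == NULL)
			return -ENOMEM;
	}
	if (register_net_sysctl_table(net, net_ipv4_ctl_path, table) == NULL) {
		if (!net_eq(net, &init_net))
			kfree(table);
		return -ENOMEM;
	}
	return 0;
}

The net_eq() conversions in the same hunks are equivalent to the open-coded pointer compares; the helper simply compiles away to 1 when network namespaces are disabled.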
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc71088..296150b2a62f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,8 @@
264#include <linux/cache.h> 264#include <linux/cache.h>
265#include <linux/err.h> 265#include <linux/err.h>
266#include <linux/crypto.h> 266#include <linux/crypto.h>
267#include <linux/time.h>
268#include <linux/slab.h>
267 269
268#include <net/icmp.h> 270#include <net/icmp.h>
269#include <net/tcp.h> 271#include <net/tcp.h>
@@ -428,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
428 if (tp->urg_seq == tp->copied_seq && 430 if (tp->urg_seq == tp->copied_seq &&
429 !sock_flag(sk, SOCK_URGINLINE) && 431 !sock_flag(sk, SOCK_URGINLINE) &&
430 tp->urg_data) 432 tp->urg_data)
431 target--; 433 target++;
432 434
433 /* Potential race condition. If read of tp below will 435 /* Potential race condition. If read of tp below will
434 * escape above sk->sk_state, we can be illegally awaken 436 * escape above sk->sk_state, we can be illegally awaken
@@ -535,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
535 tp->nonagle &= ~TCP_NAGLE_PUSH; 537 tp->nonagle &= ~TCP_NAGLE_PUSH;
536} 538}
537 539
538static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, 540static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
539 struct sk_buff *skb)
540{ 541{
541 if (flags & MSG_OOB) 542 if (flags & MSG_OOB)
542 tp->snd_up = tp->write_seq; 543 tp->snd_up = tp->write_seq;
@@ -545,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
545static inline void tcp_push(struct sock *sk, int flags, int mss_now, 546static inline void tcp_push(struct sock *sk, int flags, int mss_now,
546 int nonagle) 547 int nonagle)
547{ 548{
548 struct tcp_sock *tp = tcp_sk(sk);
549
550 if (tcp_send_head(sk)) { 549 if (tcp_send_head(sk)) {
551 struct sk_buff *skb = tcp_write_queue_tail(sk); 550 struct tcp_sock *tp = tcp_sk(sk);
551
552 if (!(flags & MSG_MORE) || forced_push(tp)) 552 if (!(flags & MSG_MORE) || forced_push(tp))
553 tcp_mark_push(tp, skb); 553 tcp_mark_push(tp, tcp_write_queue_tail(sk));
554 tcp_mark_urg(tp, flags, skb); 554
555 tcp_mark_urg(tp, flags);
555 __tcp_push_pending_frames(sk, mss_now, 556 __tcp_push_pending_frames(sk, mss_now,
556 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); 557 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
557 } 558 }
@@ -876,12 +877,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
876#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 877#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
877#define TCP_OFF(sk) (sk->sk_sndmsg_off) 878#define TCP_OFF(sk) (sk->sk_sndmsg_off)
878 879
879static inline int select_size(struct sock *sk) 880static inline int select_size(struct sock *sk, int sg)
880{ 881{
881 struct tcp_sock *tp = tcp_sk(sk); 882 struct tcp_sock *tp = tcp_sk(sk);
882 int tmp = tp->mss_cache; 883 int tmp = tp->mss_cache;
883 884
884 if (sk->sk_route_caps & NETIF_F_SG) { 885 if (sg) {
885 if (sk_can_gso(sk)) 886 if (sk_can_gso(sk))
886 tmp = 0; 887 tmp = 0;
887 else { 888 else {
@@ -905,7 +906,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
905 struct sk_buff *skb; 906 struct sk_buff *skb;
906 int iovlen, flags; 907 int iovlen, flags;
907 int mss_now, size_goal; 908 int mss_now, size_goal;
908 int err, copied; 909 int sg, err, copied;
909 long timeo; 910 long timeo;
910 911
911 lock_sock(sk); 912 lock_sock(sk);
@@ -933,6 +934,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
933 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 934 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
934 goto out_err; 935 goto out_err;
935 936
937 sg = sk->sk_route_caps & NETIF_F_SG;
938
936 while (--iovlen >= 0) { 939 while (--iovlen >= 0) {
937 int seglen = iov->iov_len; 940 int seglen = iov->iov_len;
938 unsigned char __user *from = iov->iov_base; 941 unsigned char __user *from = iov->iov_base;
@@ -958,8 +961,9 @@ new_segment:
958 if (!sk_stream_memory_free(sk)) 961 if (!sk_stream_memory_free(sk))
959 goto wait_for_sndbuf; 962 goto wait_for_sndbuf;
960 963
961 skb = sk_stream_alloc_skb(sk, select_size(sk), 964 skb = sk_stream_alloc_skb(sk,
962 sk->sk_allocation); 965 select_size(sk, sg),
966 sk->sk_allocation);
963 if (!skb) 967 if (!skb)
964 goto wait_for_memory; 968 goto wait_for_memory;
965 969
@@ -996,9 +1000,7 @@ new_segment:
996 /* We can extend the last page 1000 /* We can extend the last page
997 * fragment. */ 1001 * fragment. */
998 merge = 1; 1002 merge = 1;
999 } else if (i == MAX_SKB_FRAGS || 1003 } else if (i == MAX_SKB_FRAGS || !sg) {
1000 (!i &&
1001 !(sk->sk_route_caps & NETIF_F_SG))) {
1002 /* Need to add new fragment and cannot 1004 /* Need to add new fragment and cannot
1003 * do this because interface is non-SG, 1005 * do this because interface is non-SG,
1004 * or because all the page slots are 1006 * or because all the page slots are
@@ -1253,6 +1255,39 @@ static void tcp_prequeue_process(struct sock *sk)
1253 tp->ucopy.memory = 0; 1255 tp->ucopy.memory = 0;
1254} 1256}
1255 1257
1258#ifdef CONFIG_NET_DMA
1259static void tcp_service_net_dma(struct sock *sk, bool wait)
1260{
1261 dma_cookie_t done, used;
1262 dma_cookie_t last_issued;
1263 struct tcp_sock *tp = tcp_sk(sk);
1264
1265 if (!tp->ucopy.dma_chan)
1266 return;
1267
1268 last_issued = tp->ucopy.dma_cookie;
1269 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1270
1271 do {
1272 if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1273 last_issued, &done,
1274 &used) == DMA_SUCCESS) {
1275 /* Safe to free early-copied skbs now */
1276 __skb_queue_purge(&sk->sk_async_wait_queue);
1277 break;
1278 } else {
1279 struct sk_buff *skb;
1280 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1281 (dma_async_is_complete(skb->dma_cookie, done,
1282 used) == DMA_SUCCESS)) {
1283 __skb_dequeue(&sk->sk_async_wait_queue);
1284 kfree_skb(skb);
1285 }
1286 }
1287 } while (wait);
1288}
1289#endif
1290
1256static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1291static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1257{ 1292{
1258 struct sk_buff *skb; 1293 struct sk_buff *skb;
@@ -1334,6 +1369,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1334 sk_eat_skb(sk, skb, 0); 1369 sk_eat_skb(sk, skb, 0);
1335 if (!desc->count) 1370 if (!desc->count)
1336 break; 1371 break;
1372 tp->copied_seq = seq;
1337 } 1373 }
1338 tp->copied_seq = seq; 1374 tp->copied_seq = seq;
1339 1375
@@ -1545,6 +1581,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1545 /* __ Set realtime policy in scheduler __ */ 1581 /* __ Set realtime policy in scheduler __ */
1546 } 1582 }
1547 1583
1584#ifdef CONFIG_NET_DMA
1585 if (tp->ucopy.dma_chan)
1586 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1587#endif
1548 if (copied >= target) { 1588 if (copied >= target) {
1549 /* Do not sleep, just process backlog. */ 1589 /* Do not sleep, just process backlog. */
1550 release_sock(sk); 1590 release_sock(sk);
@@ -1553,6 +1593,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1553 sk_wait_data(sk, &timeo); 1593 sk_wait_data(sk, &timeo);
1554 1594
1555#ifdef CONFIG_NET_DMA 1595#ifdef CONFIG_NET_DMA
1596 tcp_service_net_dma(sk, false); /* Don't block */
1556 tp->ucopy.wakeup = 0; 1597 tp->ucopy.wakeup = 0;
1557#endif 1598#endif
1558 1599
@@ -1632,6 +1673,9 @@ do_prequeue:
1632 copied = -EFAULT; 1673 copied = -EFAULT;
1633 break; 1674 break;
1634 } 1675 }
1676
1677 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1678
1635 if ((offset + used) == skb->len) 1679 if ((offset + used) == skb->len)
1636 copied_early = 1; 1680 copied_early = 1;
1637 1681
@@ -1701,27 +1745,9 @@ skip_copy:
1701 } 1745 }
1702 1746
1703#ifdef CONFIG_NET_DMA 1747#ifdef CONFIG_NET_DMA
1704 if (tp->ucopy.dma_chan) { 1748 tcp_service_net_dma(sk, true); /* Wait for queue to drain */
1705 dma_cookie_t done, used; 1749 tp->ucopy.dma_chan = NULL;
1706
1707 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1708
1709 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1710 tp->ucopy.dma_cookie, &done,
1711 &used) == DMA_IN_PROGRESS) {
1712 /* do partial cleanup of sk_async_wait_queue */
1713 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1714 (dma_async_is_complete(skb->dma_cookie, done,
1715 used) == DMA_SUCCESS)) {
1716 __skb_dequeue(&sk->sk_async_wait_queue);
1717 kfree_skb(skb);
1718 }
1719 }
1720 1750
1721 /* Safe to free early-copied skbs now */
1722 __skb_queue_purge(&sk->sk_async_wait_queue);
1723 tp->ucopy.dma_chan = NULL;
1724 }
1725 if (tp->ucopy.pinned_list) { 1751 if (tp->ucopy.pinned_list) {
1726 dma_unpin_iovec_pages(tp->ucopy.pinned_list); 1752 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1727 tp->ucopy.pinned_list = NULL; 1753 tp->ucopy.pinned_list = NULL;
@@ -2042,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2042 __skb_queue_purge(&sk->sk_async_wait_queue); 2068 __skb_queue_purge(&sk->sk_async_wait_queue);
2043#endif 2069#endif
2044 2070
2045 inet->dport = 0; 2071 inet->inet_dport = 0;
2046 2072
2047 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 2073 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2048 inet_reset_saddr(sk); 2074 inet_reset_saddr(sk);
@@ -2059,6 +2085,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2059 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 2085 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2060 tp->snd_cwnd_cnt = 0; 2086 tp->snd_cwnd_cnt = 0;
2061 tp->bytes_acked = 0; 2087 tp->bytes_acked = 0;
2088 tp->window_clamp = 0;
2062 tcp_set_ca_state(sk, TCP_CA_Open); 2089 tcp_set_ca_state(sk, TCP_CA_Open);
2063 tcp_clear_retrans(tp); 2090 tcp_clear_retrans(tp);
2064 inet_csk_delack_init(sk); 2091 inet_csk_delack_init(sk);
@@ -2066,7 +2093,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2066 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2093 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2067 __sk_dst_reset(sk); 2094 __sk_dst_reset(sk);
2068 2095
2069 WARN_ON(inet->num && !icsk->icsk_bind_hash); 2096 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2070 2097
2071 sk->sk_error_report(sk); 2098 sk->sk_error_report(sk);
2072 return err; 2099 return err;
@@ -2083,8 +2110,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2083 int val; 2110 int val;
2084 int err = 0; 2111 int err = 0;
2085 2112
2086 /* This is a string value all the others are int's */ 2113 /* These are data/string values, all the others are ints */
2087 if (optname == TCP_CONGESTION) { 2114 switch (optname) {
2115 case TCP_CONGESTION: {
2088 char name[TCP_CA_NAME_MAX]; 2116 char name[TCP_CA_NAME_MAX];
2089 2117
2090 if (optlen < 1) 2118 if (optlen < 1)
@@ -2101,6 +2129,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2101 release_sock(sk); 2129 release_sock(sk);
2102 return err; 2130 return err;
2103 } 2131 }
2132 case TCP_COOKIE_TRANSACTIONS: {
2133 struct tcp_cookie_transactions ctd;
2134 struct tcp_cookie_values *cvp = NULL;
2135
2136 if (sizeof(ctd) > optlen)
2137 return -EINVAL;
2138 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2139 return -EFAULT;
2140
2141 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2142 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2143 return -EINVAL;
2144
2145 if (ctd.tcpct_cookie_desired == 0) {
2146 /* default to global value */
2147 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2148 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2149 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2150 return -EINVAL;
2151 }
2152
2153 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
 2154 /* Supersedes all other values */
2155 lock_sock(sk);
2156 if (tp->cookie_values != NULL) {
2157 kref_put(&tp->cookie_values->kref,
2158 tcp_cookie_values_release);
2159 tp->cookie_values = NULL;
2160 }
2161 tp->rx_opt.cookie_in_always = 0; /* false */
2162 tp->rx_opt.cookie_out_never = 1; /* true */
2163 release_sock(sk);
2164 return err;
2165 }
2166
2167 /* Allocate ancillary memory before locking.
2168 */
2169 if (ctd.tcpct_used > 0 ||
2170 (tp->cookie_values == NULL &&
2171 (sysctl_tcp_cookie_size > 0 ||
2172 ctd.tcpct_cookie_desired > 0 ||
2173 ctd.tcpct_s_data_desired > 0))) {
2174 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2175 GFP_KERNEL);
2176 if (cvp == NULL)
2177 return -ENOMEM;
2178 }
2179 lock_sock(sk);
2180 tp->rx_opt.cookie_in_always =
2181 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2182 tp->rx_opt.cookie_out_never = 0; /* false */
2183
2184 if (tp->cookie_values != NULL) {
2185 if (cvp != NULL) {
2186 /* Changed values are recorded by a changed
2187 * pointer, ensuring the cookie will differ,
2188 * without separately hashing each value later.
2189 */
2190 kref_put(&tp->cookie_values->kref,
2191 tcp_cookie_values_release);
2192 kref_init(&cvp->kref);
2193 tp->cookie_values = cvp;
2194 } else {
2195 cvp = tp->cookie_values;
2196 }
2197 }
2198 if (cvp != NULL) {
2199 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2200
2201 if (ctd.tcpct_used > 0) {
2202 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2203 ctd.tcpct_used);
2204 cvp->s_data_desired = ctd.tcpct_used;
2205 cvp->s_data_constant = 1; /* true */
2206 } else {
2207 /* No constant payload data. */
2208 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2209 cvp->s_data_constant = 0; /* false */
2210 }
2211 }
2212 release_sock(sk);
2213 return err;
2214 }
2215 default:
2216 /* fallthru */
2217 break;
2218 };
2104 2219
2105 if (optlen < sizeof(int)) 2220 if (optlen < sizeof(int))
2106 return -EINVAL; 2221 return -EINVAL;
@@ -2139,6 +2254,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2139 } 2254 }
2140 break; 2255 break;
2141 2256
2257 case TCP_THIN_LINEAR_TIMEOUTS:
2258 if (val < 0 || val > 1)
2259 err = -EINVAL;
2260 else
2261 tp->thin_lto = val;
2262 break;
2263
2264 case TCP_THIN_DUPACK:
2265 if (val < 0 || val > 1)
2266 err = -EINVAL;
2267 else
2268 tp->thin_dupack = val;
2269 break;
2270
2142 case TCP_CORK: 2271 case TCP_CORK:
2143 /* When set indicates to always queue non-full frames. 2272 /* When set indicates to always queue non-full frames.
2144 * Later the user clears this option and we transmit 2273 * Later the user clears this option and we transmit
@@ -2425,6 +2554,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2425 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) 2554 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2426 return -EFAULT; 2555 return -EFAULT;
2427 return 0; 2556 return 0;
2557
2558 case TCP_COOKIE_TRANSACTIONS: {
2559 struct tcp_cookie_transactions ctd;
2560 struct tcp_cookie_values *cvp = tp->cookie_values;
2561
2562 if (get_user(len, optlen))
2563 return -EFAULT;
2564 if (len < sizeof(ctd))
2565 return -EINVAL;
2566
2567 memset(&ctd, 0, sizeof(ctd));
2568 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2569 TCP_COOKIE_IN_ALWAYS : 0)
2570 | (tp->rx_opt.cookie_out_never ?
2571 TCP_COOKIE_OUT_NEVER : 0);
2572
2573 if (cvp != NULL) {
2574 ctd.tcpct_flags |= (cvp->s_data_in ?
2575 TCP_S_DATA_IN : 0)
2576 | (cvp->s_data_out ?
2577 TCP_S_DATA_OUT : 0);
2578
2579 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2580 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2581
2582 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2583 cvp->cookie_pair_size);
2584 ctd.tcpct_used = cvp->cookie_pair_size;
2585 }
2586
2587 if (put_user(sizeof(ctd), optlen))
2588 return -EFAULT;
2589 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2590 return -EFAULT;
2591 return 0;
2592 }
2428 default: 2593 default:
2429 return -ENOPROTOOPT; 2594 return -ENOPROTOOPT;
2430 } 2595 }
@@ -2662,10 +2827,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
2662 2827
2663#ifdef CONFIG_TCP_MD5SIG 2828#ifdef CONFIG_TCP_MD5SIG
2664static unsigned long tcp_md5sig_users; 2829static unsigned long tcp_md5sig_users;
2665static struct tcp_md5sig_pool **tcp_md5sig_pool; 2830static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
2666static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2831static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2667 2832
2668static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) 2833static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
2669{ 2834{
2670 int cpu; 2835 int cpu;
2671 for_each_possible_cpu(cpu) { 2836 for_each_possible_cpu(cpu) {
@@ -2674,7 +2839,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2674 if (p->md5_desc.tfm) 2839 if (p->md5_desc.tfm)
2675 crypto_free_hash(p->md5_desc.tfm); 2840 crypto_free_hash(p->md5_desc.tfm);
2676 kfree(p); 2841 kfree(p);
2677 p = NULL;
2678 } 2842 }
2679 } 2843 }
2680 free_percpu(pool); 2844 free_percpu(pool);
@@ -2682,7 +2846,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2682 2846
2683void tcp_free_md5sig_pool(void) 2847void tcp_free_md5sig_pool(void)
2684{ 2848{
2685 struct tcp_md5sig_pool **pool = NULL; 2849 struct tcp_md5sig_pool * __percpu *pool = NULL;
2686 2850
2687 spin_lock_bh(&tcp_md5sig_pool_lock); 2851 spin_lock_bh(&tcp_md5sig_pool_lock);
2688 if (--tcp_md5sig_users == 0) { 2852 if (--tcp_md5sig_users == 0) {
@@ -2696,10 +2860,11 @@ void tcp_free_md5sig_pool(void)
2696 2860
2697EXPORT_SYMBOL(tcp_free_md5sig_pool); 2861EXPORT_SYMBOL(tcp_free_md5sig_pool);
2698 2862
2699static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk) 2863static struct tcp_md5sig_pool * __percpu *
2864__tcp_alloc_md5sig_pool(struct sock *sk)
2700{ 2865{
2701 int cpu; 2866 int cpu;
2702 struct tcp_md5sig_pool **pool; 2867 struct tcp_md5sig_pool * __percpu *pool;
2703 2868
2704 pool = alloc_percpu(struct tcp_md5sig_pool *); 2869 pool = alloc_percpu(struct tcp_md5sig_pool *);
2705 if (!pool) 2870 if (!pool)
@@ -2726,9 +2891,9 @@ out_free:
2726 return NULL; 2891 return NULL;
2727} 2892}
2728 2893
2729struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk) 2894struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
2730{ 2895{
2731 struct tcp_md5sig_pool **pool; 2896 struct tcp_md5sig_pool * __percpu *pool;
2732 int alloc = 0; 2897 int alloc = 0;
2733 2898
2734retry: 2899retry:
@@ -2747,7 +2912,9 @@ retry:
2747 2912
2748 if (alloc) { 2913 if (alloc) {
2749 /* we cannot hold spinlock here because this may sleep. */ 2914 /* we cannot hold spinlock here because this may sleep. */
2750 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk); 2915 struct tcp_md5sig_pool * __percpu *p;
2916
2917 p = __tcp_alloc_md5sig_pool(sk);
2751 spin_lock_bh(&tcp_md5sig_pool_lock); 2918 spin_lock_bh(&tcp_md5sig_pool_lock);
2752 if (!p) { 2919 if (!p) {
2753 tcp_md5sig_users--; 2920 tcp_md5sig_users--;
@@ -2769,25 +2936,40 @@ retry:
2769 2936
2770EXPORT_SYMBOL(tcp_alloc_md5sig_pool); 2937EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2771 2938
2772struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) 2939
2940/**
2941 * tcp_get_md5sig_pool - get md5sig_pool for this user
2942 *
 2943 * We use a percpu structure, so if we succeed, we exit with preemption
 2944 * and BH disabled, to make sure another thread or softirq handler
 2945 * won't try to get the same context.
2946 */
2947struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
2773{ 2948{
2774 struct tcp_md5sig_pool **p; 2949 struct tcp_md5sig_pool * __percpu *p;
2775 spin_lock_bh(&tcp_md5sig_pool_lock); 2950
2951 local_bh_disable();
2952
2953 spin_lock(&tcp_md5sig_pool_lock);
2776 p = tcp_md5sig_pool; 2954 p = tcp_md5sig_pool;
2777 if (p) 2955 if (p)
2778 tcp_md5sig_users++; 2956 tcp_md5sig_users++;
2779 spin_unlock_bh(&tcp_md5sig_pool_lock); 2957 spin_unlock(&tcp_md5sig_pool_lock);
2780 return (p ? *per_cpu_ptr(p, cpu) : NULL);
2781}
2782 2958
2783EXPORT_SYMBOL(__tcp_get_md5sig_pool); 2959 if (p)
2960 return *per_cpu_ptr(p, smp_processor_id());
2961
2962 local_bh_enable();
2963 return NULL;
2964}
2965EXPORT_SYMBOL(tcp_get_md5sig_pool);
2784 2966
2785void __tcp_put_md5sig_pool(void) 2967void tcp_put_md5sig_pool(void)
2786{ 2968{
2969 local_bh_enable();
2787 tcp_free_md5sig_pool(); 2970 tcp_free_md5sig_pool();
2788} 2971}
2789 2972EXPORT_SYMBOL(tcp_put_md5sig_pool);
2790EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2791 2973
2792int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 2974int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2793 struct tcphdr *th) 2975 struct tcphdr *th)
@@ -2847,6 +3029,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
2847 3029
2848#endif 3030#endif
2849 3031
3032/**
3033 * Each Responder maintains up to two secret values concurrently for
3034 * efficient secret rollover. Each secret value has 4 states:
3035 *
3036 * Generating. (tcp_secret_generating != tcp_secret_primary)
3037 * Generates new Responder-Cookies, but not yet used for primary
3038 * verification. This is a short-term state, typically lasting only
3039 * one round trip time (RTT).
3040 *
3041 * Primary. (tcp_secret_generating == tcp_secret_primary)
3042 * Used both for generation and primary verification.
3043 *
3044 * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
3045 * Used for verification, until the first failure that can be
3046 * verified by the newer Generating secret. At that time, this
3047 * cookie's state is changed to Secondary, and the Generating
3048 * cookie's state is changed to Primary. This is a short-term state,
3049 * typically lasting only one round trip time (RTT).
3050 *
3051 * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
3052 * Used for secondary verification, after primary verification
3053 * failures. This state lasts no more than twice the Maximum Segment
3054 * Lifetime (2MSL). Then, the secret is discarded.
3055 */
3056struct tcp_cookie_secret {
3057 /* The secret is divided into two parts. The digest part is the
3058 * equivalent of previously hashing a secret and saving the state,
3059 * and serves as an initialization vector (IV). The message part
3060 * serves as the trailing secret.
3061 */
3062 u32 secrets[COOKIE_WORKSPACE_WORDS];
3063 unsigned long expires;
3064};
3065
3066#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
3067#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
3068#define TCP_SECRET_LIFE (HZ * 600)
3069
3070static struct tcp_cookie_secret tcp_secret_one;
3071static struct tcp_cookie_secret tcp_secret_two;
3072
3073/* Essentially a circular list, without dynamic allocation. */
3074static struct tcp_cookie_secret *tcp_secret_generating;
3075static struct tcp_cookie_secret *tcp_secret_primary;
3076static struct tcp_cookie_secret *tcp_secret_retiring;
3077static struct tcp_cookie_secret *tcp_secret_secondary;
3078
3079static DEFINE_SPINLOCK(tcp_secret_locker);
3080
3081/* Select a pseudo-random word in the cookie workspace.
3082 */
3083static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3084{
3085 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3086}
3087
3088/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
3089 * Called in softirq context.
3090 * Returns: 0 for success.
3091 */
3092int tcp_cookie_generator(u32 *bakery)
3093{
3094 unsigned long jiffy = jiffies;
3095
3096 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3097 spin_lock_bh(&tcp_secret_locker);
3098 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
3099 /* refreshed by another */
3100 memcpy(bakery,
3101 &tcp_secret_generating->secrets[0],
3102 COOKIE_WORKSPACE_WORDS);
3103 } else {
3104 /* still needs refreshing */
3105 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3106
3107 /* The first time, paranoia assumes that the
3108 * randomization function isn't as strong. But,
3109 * this secret initialization is delayed until
3110 * the last possible moment (packet arrival).
3111 * Although that time is observable, it is
3112 * unpredictably variable. Mash in the most
3113 * volatile clock bits available, and expire the
3114 * secret extra quickly.
3115 */
3116 if (unlikely(tcp_secret_primary->expires ==
3117 tcp_secret_secondary->expires)) {
3118 struct timespec tv;
3119
3120 getnstimeofday(&tv);
3121 bakery[COOKIE_DIGEST_WORDS+0] ^=
3122 (u32)tv.tv_nsec;
3123
3124 tcp_secret_secondary->expires = jiffy
3125 + TCP_SECRET_1MSL
3126 + (0x0f & tcp_cookie_work(bakery, 0));
3127 } else {
3128 tcp_secret_secondary->expires = jiffy
3129 + TCP_SECRET_LIFE
3130 + (0xff & tcp_cookie_work(bakery, 1));
3131 tcp_secret_primary->expires = jiffy
3132 + TCP_SECRET_2MSL
3133 + (0x1f & tcp_cookie_work(bakery, 2));
3134 }
3135 memcpy(&tcp_secret_secondary->secrets[0],
3136 bakery, COOKIE_WORKSPACE_WORDS);
3137
3138 rcu_assign_pointer(tcp_secret_generating,
3139 tcp_secret_secondary);
3140 rcu_assign_pointer(tcp_secret_retiring,
3141 tcp_secret_primary);
3142 /*
3143 * Neither call_rcu() nor synchronize_rcu() needed.
3144 * Retiring data is not freed. It is replaced after
3145 * further (locked) pointer updates, and a quiet time
3146 * (minimum 1MSL, maximum LIFE - 2MSL).
3147 */
3148 }
3149 spin_unlock_bh(&tcp_secret_locker);
3150 } else {
3151 rcu_read_lock_bh();
3152 memcpy(bakery,
3153 &rcu_dereference(tcp_secret_generating)->secrets[0],
3154 COOKIE_WORKSPACE_WORDS);
3155 rcu_read_unlock_bh();
3156 }
3157 return 0;
3158}
3159EXPORT_SYMBOL(tcp_cookie_generator);
3160
2850void tcp_done(struct sock *sk) 3161void tcp_done(struct sock *sk)
2851{ 3162{
2852 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3163 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
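tcp_cookie_generator() above is the sole entry point the rest of the stack needs: the caller supplies a COOKIE_WORKSPACE_WORDS scratch buffer and gets back the current generating secret, with rotation and RCU publication handled internally. A hedged caller sketch (the real consumer is the SYNACK cookie path in tcp_output.c; the function and variable names here are illustrative):

#include <net/tcp.h>

static void example_bake_cookie(void)
{
	u32 bakery[COOKIE_WORKSPACE_WORDS];

	if (tcp_cookie_generator(bakery) == 0) {
		/* bakery[0 .. COOKIE_DIGEST_WORDS-1] is the digest/IV
		 * part; the remaining words are the trailing secret.
		 * A per-connection cookie is derived by hashing the
		 * connection addresses and ports over this workspace.
		 */
	}
}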
@@ -2881,6 +3192,7 @@ void __init tcp_init(void)
2881 struct sk_buff *skb = NULL; 3192 struct sk_buff *skb = NULL;
2882 unsigned long nr_pages, limit; 3193 unsigned long nr_pages, limit;
2883 int order, i, max_share; 3194 int order, i, max_share;
3195 unsigned long jiffy = jiffies;
2884 3196
2885 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3197 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2886 3198
@@ -2903,11 +3215,10 @@ void __init tcp_init(void)
2903 (totalram_pages >= 128 * 1024) ? 3215 (totalram_pages >= 128 * 1024) ?
2904 13 : 15, 3216 13 : 15,
2905 0, 3217 0,
2906 &tcp_hashinfo.ehash_size,
2907 NULL, 3218 NULL,
3219 &tcp_hashinfo.ehash_mask,
2908 thash_entries ? 0 : 512 * 1024); 3220 thash_entries ? 0 : 512 * 1024);
2909 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 3221 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
2910 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2911 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3222 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
2912 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); 3223 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
2913 } 3224 }
@@ -2916,7 +3227,7 @@ void __init tcp_init(void)
2916 tcp_hashinfo.bhash = 3227 tcp_hashinfo.bhash =
2917 alloc_large_system_hash("TCP bind", 3228 alloc_large_system_hash("TCP bind",
2918 sizeof(struct inet_bind_hashbucket), 3229 sizeof(struct inet_bind_hashbucket),
2919 tcp_hashinfo.ehash_size, 3230 tcp_hashinfo.ehash_mask + 1,
2920 (totalram_pages >= 128 * 1024) ? 3231 (totalram_pages >= 128 * 1024) ?
2921 13 : 15, 3232 13 : 15,
2922 0, 3233 0,
@@ -2971,10 +3282,19 @@ void __init tcp_init(void)
2971 sysctl_tcp_rmem[2] = max(87380, max_share); 3282 sysctl_tcp_rmem[2] = max(87380, max_share);
2972 3283
2973 printk(KERN_INFO "TCP: Hash tables configured " 3284 printk(KERN_INFO "TCP: Hash tables configured "
2974 "(established %d bind %d)\n", 3285 "(established %u bind %u)\n",
2975 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); 3286 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
2976 3287
2977 tcp_register_congestion_control(&tcp_reno); 3288 tcp_register_congestion_control(&tcp_reno);
3289
3290 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
3291 memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
3292 tcp_secret_one.expires = jiffy; /* past due */
3293 tcp_secret_two.expires = jiffy; /* past due */
3294 tcp_secret_generating = &tcp_secret_one;
3295 tcp_secret_primary = &tcp_secret_one;
3296 tcp_secret_retiring = &tcp_secret_two;
3297 tcp_secret_secondary = &tcp_secret_two;
2978} 3298}
2979 3299
2980EXPORT_SYMBOL(tcp_close); 3300EXPORT_SYMBOL(tcp_close);
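One API change in the tcp.c hunks above is easy to miss: tcp_get_md5sig_pool()/tcp_put_md5sig_pool() replace the __tcp_get_md5sig_pool(cpu)/__tcp_put_md5sig_pool() pair, folding the BH and preemption handling into the getter so callers no longer pick a CPU themselves. A sketch of the resulting caller pattern, modelled loosely on the tcp_v4_md5_hash_skb()-style users (illustrative, not a hunk from this diff):

#include <linux/string.h>
#include <net/tcp.h>

static int example_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
			    struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;

	hp = tcp_get_md5sig_pool();	/* BH and preemption now off */
	if (!hp)
		goto clear_hash_noput;

	if (crypto_hash_init(&hp->md5_desc))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(&hp->md5_desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();		/* re-enables BH */
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}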
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6428b342b164..0ec9bd0ae94f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/gfp.h>
13#include <net/tcp.h> 14#include <net/tcp.h>
14 15
15int sysctl_tcp_max_ssthresh = 0; 16int sysctl_tcp_max_ssthresh = 0;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index fcbcd4ff6c5f..939edb3b8e4d 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
27 r->idiag_rqueue = sk->sk_ack_backlog; 27 r->idiag_rqueue = sk->sk_ack_backlog;
28 r->idiag_wqueue = sk->sk_max_ack_backlog; 28 r->idiag_wqueue = sk->sk_max_ack_backlog;
29 } else { 29 } else {
30 r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; 30 r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
31 r->idiag_wqueue = tp->write_seq - tp->snd_una; 31 r->idiag_wqueue = tp->write_seq - tp->snd_una;
32 } 32 }
33 if (info != NULL) 33 if (info != NULL)
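The max_t() above matters because rcv_nxt and copied_seq are u32 sequence numbers: if the reader has momentarily consumed past rcv_nxt, the unsigned difference wraps to a huge value. Evaluating it as a signed int and flooring at zero reports an empty queue instead. A tiny illustration (values invented):

#include <linux/kernel.h>	/* max_t() */
#include <linux/types.h>

static int example_rqueue(u32 rcv_nxt, u32 copied_seq)
{
	/* e.g. rcv_nxt == 1000, copied_seq == 1003 after a race:
	 * the unsigned difference wraps to 4294967293, but read as
	 * a signed int it is -3, so the clamp reports 0 instead.
	 */
	return max_t(int, rcv_nxt - copied_seq, 0);
}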
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 26d5c7fc7de5..7c94a4955416 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
92 if (icsk->icsk_ca_state == TCP_CA_Open) { 92 if (icsk->icsk_ca_state == TCP_CA_Open) {
93 if (ca->maxRTT < ca->minRTT) 93 if (ca->maxRTT < ca->minRTT)
94 ca->maxRTT = ca->minRTT; 94 ca->maxRTT = ca->minRTT;
95 if (ca->maxRTT < srtt 95 if (ca->maxRTT < srtt &&
96 && srtt <= ca->maxRTT + msecs_to_jiffies(20)) 96 srtt <= ca->maxRTT + msecs_to_jiffies(20))
97 ca->maxRTT = srtt; 97 ca->maxRTT = srtt;
98 } 98 }
99} 99}
@@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
123 123
124 ca->packetcount += pkts_acked; 124 ca->packetcount += pkts_acked;
125 125
126 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) 126 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
127 && now - ca->lasttime >= ca->minRTT 127 now - ca->lasttime >= ca->minRTT &&
128 && ca->minRTT > 0) { 128 ca->minRTT > 0) {
129 __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); 129 __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
130 130
131 if (htcp_ccount(ca) <= 3) { 131 if (htcp_ccount(ca) <= 3) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784be7ab3..f240f57b2199 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
62 */ 62 */
63 63
64#include <linux/mm.h> 64#include <linux/mm.h>
65#include <linux/slab.h>
65#include <linux/module.h> 66#include <linux/module.h>
66#include <linux/sysctl.h> 67#include <linux/sysctl.h>
67#include <linux/kernel.h> 68#include <linux/kernel.h>
@@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2;
89int sysctl_tcp_frto_response __read_mostly; 90int sysctl_tcp_frto_response __read_mostly;
90int sysctl_tcp_nometrics_save __read_mostly; 91int sysctl_tcp_nometrics_save __read_mostly;
91 92
93int sysctl_tcp_thin_dupack __read_mostly;
94
92int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 95int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
93int sysctl_tcp_abc __read_mostly; 96int sysctl_tcp_abc __read_mostly;
94 97
@@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
140 * "len" is invariant segment length, including TCP header. 143 * "len" is invariant segment length, including TCP header.
141 */ 144 */
142 len += skb->data - skb_transport_header(skb); 145 len += skb->data - skb_transport_header(skb);
143 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || 146 if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
144 /* If PSH is not set, packet should be 147 /* If PSH is not set, packet should be
145 * full sized, provided peer TCP is not badly broken. 148 * full sized, provided peer TCP is not badly broken.
146 * This observation (if it is correct 8)) allows 149 * This observation (if it is correct 8)) allows
@@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
411 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 414 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
412 415
413 hint = min(hint, tp->rcv_wnd / 2); 416 hint = min(hint, tp->rcv_wnd / 2);
414 hint = min(hint, TCP_MIN_RCVMSS); 417 hint = min(hint, TCP_MSS_DEFAULT);
415 hint = max(hint, TCP_MIN_MSS); 418 hint = max(hint, TCP_MIN_MSS);
416 419
417 inet_csk(sk)->icsk_ack.rcv_mss = hint; 420 inet_csk(sk)->icsk_ack.rcv_mss = hint;
@@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
2300 * they differ. Since neither occurs due to loss, TCP should really 2303 * they differ. Since neither occurs due to loss, TCP should really
2301 * ignore them. 2304 * ignore them.
2302 */ 2305 */
2303static inline int tcp_dupack_heurestics(struct tcp_sock *tp) 2306static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2304{ 2307{
2305 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2308 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2306} 2309}
@@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk)
2425 return 1; 2428 return 1;
2426 2429
2427 /* Not-A-Trick#2 : Classic rule... */ 2430 /* Not-A-Trick#2 : Classic rule... */
2428 if (tcp_dupack_heurestics(tp) > tp->reordering) 2431 if (tcp_dupack_heuristics(tp) > tp->reordering)
2429 return 1; 2432 return 1;
2430 2433
2431 /* Trick#3 : when we use RFC2988 timer restart, fast 2434 /* Trick#3 : when we use RFC2988 timer restart, fast
@@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk)
2447 return 1; 2450 return 1;
2448 } 2451 }
2449 2452
 2453 /* If a thin stream is detected, retransmit after the first
 2454 * received dupack. Employ only if SACK is supported, in order
 2455 * to avoid a possible corner-case series of spurious retransmissions.
 2456 * Use only if there is no unsent data.
2457 */
2458 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2459 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2460 tcp_is_sack(tp) && !tcp_send_head(sk))
2461 return 1;
2462
2450 return 0; 2463 return 0;
2451} 2464}
2452 2465
@@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2499 int err; 2512 int err;
2500 unsigned int mss; 2513 unsigned int mss;
2501 2514
2515 if (packets == 0)
2516 return;
2517
2502 WARN_ON(packets > tp->packets_out); 2518 WARN_ON(packets > tp->packets_out);
2503 if (tp->lost_skb_hint) { 2519 if (tp->lost_skb_hint) {
2504 skb = tp->lost_skb_hint; 2520 skb = tp->lost_skb_hint;
@@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
2717 } 2733 }
2718} 2734}
2719 2735
2736/* We can clear retrans_stamp when there are no retransmissions in the
2737 * window. It would seem that it is trivially available for us in
 2738 * tp->retrans_out, however, that kind of assumption doesn't consider
 2739 * what will happen if errors occur when sending a retransmission for the
 2740 * second time. ...It could be that such a segment has only
2741 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
2742 * the head skb is enough except for some reneging corner cases that
2743 * are not worth the effort.
2744 *
 2745 * The main reason for all this complexity is that the connection dying
2746 * time now depends on the validity of the retrans_stamp, in particular,
2747 * that successive retransmissions of a segment must not advance
2748 * retrans_stamp under any conditions.
2749 */
2750static int tcp_any_retrans_done(struct sock *sk)
2751{
2752 struct tcp_sock *tp = tcp_sk(sk);
2753 struct sk_buff *skb;
2754
2755 if (tp->retrans_out)
2756 return 1;
2757
2758 skb = tcp_write_queue_head(sk);
2759 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2760 return 1;
2761
2762 return 0;
2763}
2764
2720/* Undo during fast recovery after partial ACK. */ 2765/* Undo during fast recovery after partial ACK. */
2721 2766
2722static int tcp_try_undo_partial(struct sock *sk, int acked) 2767static int tcp_try_undo_partial(struct sock *sk, int acked)
@@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
 2729 /* Plain luck! Hole is filled with delayed 2774 /* Plain luck! Hole is filled with delayed
2730 * packet, rather than with a retransmit. 2775 * packet, rather than with a retransmit.
2731 */ 2776 */
2732 if (tp->retrans_out == 0) 2777 if (!tcp_any_retrans_done(sk))
2733 tp->retrans_stamp = 0; 2778 tp->retrans_stamp = 0;
2734 2779
2735 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2780 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk)
2788 struct tcp_sock *tp = tcp_sk(sk); 2833 struct tcp_sock *tp = tcp_sk(sk);
2789 int state = TCP_CA_Open; 2834 int state = TCP_CA_Open;
2790 2835
2791 if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) 2836 if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
2792 state = TCP_CA_Disorder; 2837 state = TCP_CA_Disorder;
2793 2838
2794 if (inet_csk(sk)->icsk_ca_state != state) { 2839 if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2803 2848
2804 tcp_verify_left_out(tp); 2849 tcp_verify_left_out(tp);
2805 2850
2806 if (!tp->frto_counter && tp->retrans_out == 0) 2851 if (!tp->frto_counter && !tcp_any_retrans_done(sk))
2807 tp->retrans_stamp = 0; 2852 tp->retrans_stamp = 0;
2808 2853
2809 if (flag & FLAG_ECE) 2854 if (flag & FLAG_ECE)
@@ -3698,7 +3743,7 @@ old_ack:
3698 * the fast version below fails. 3743 * the fast version below fails.
3699 */ 3744 */
3700void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, 3745void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3701 int estab) 3746 u8 **hvpp, int estab)
3702{ 3747{
3703 unsigned char *ptr; 3748 unsigned char *ptr;
3704 struct tcphdr *th = tcp_hdr(skb); 3749 struct tcphdr *th = tcp_hdr(skb);
@@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3782 */ 3827 */
3783 break; 3828 break;
3784#endif 3829#endif
3785 } 3830 case TCPOPT_COOKIE:
3831 /* This option is variable length.
3832 */
3833 switch (opsize) {
3834 case TCPOLEN_COOKIE_BASE:
3835 /* not yet implemented */
3836 break;
3837 case TCPOLEN_COOKIE_PAIR:
3838 /* not yet implemented */
3839 break;
3840 case TCPOLEN_COOKIE_MIN+0:
3841 case TCPOLEN_COOKIE_MIN+2:
3842 case TCPOLEN_COOKIE_MIN+4:
3843 case TCPOLEN_COOKIE_MIN+6:
3844 case TCPOLEN_COOKIE_MAX:
3845 /* 16-bit multiple */
3846 opt_rx->cookie_plus = opsize;
3847 *hvpp = ptr;
3848 default:
3849 /* ignore option */
3850 break;
3851 };
3852 break;
3853 };
3786 3854
3787 ptr += opsize-2; 3855 ptr += opsize-2;
3788 length -= opsize; 3856 length -= opsize;
@@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3810 * If it is wrong it falls back on tcp_parse_options(). 3878 * If it is wrong it falls back on tcp_parse_options().
3811 */ 3879 */
3812static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, 3880static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3813 struct tcp_sock *tp) 3881 struct tcp_sock *tp, u8 **hvpp)
3814{ 3882{
3815 if (th->doff == sizeof(struct tcphdr) >> 2) { 3883 /* In the spirit of fast parsing, compare doff directly to constant
3884 * values. Because equality is used, short doff can be ignored here.
3885 */
3886 if (th->doff == (sizeof(*th) / 4)) {
3816 tp->rx_opt.saw_tstamp = 0; 3887 tp->rx_opt.saw_tstamp = 0;
3817 return 0; 3888 return 0;
3818 } else if (tp->rx_opt.tstamp_ok && 3889 } else if (tp->rx_opt.tstamp_ok &&
3819 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { 3890 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3820 if (tcp_parse_aligned_timestamp(tp, th)) 3891 if (tcp_parse_aligned_timestamp(tp, th))
3821 return 1; 3892 return 1;
3822 } 3893 }
3823 tcp_parse_options(skb, &tp->rx_opt, 1); 3894 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3824 return 1; 3895 return 1;
3825} 3896}
3826 3897
@@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4845 struct tcp_sock *tp = tcp_sk(sk); 4916 struct tcp_sock *tp = tcp_sk(sk);
4846 4917
4847 /* More than one full frame received... */ 4918 /* More than one full frame received... */
4848 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss 4919 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
4849 /* ... and right edge of window advances far enough. 4920 /* ... and right edge of window advances far enough.
4850 * (tcp_recvmsg() will send ACK otherwise). Or... 4921 * (tcp_recvmsg() will send ACK otherwise). Or...
4851 */ 4922 */
4852 && __tcp_select_window(sk) >= tp->rcv_wnd) || 4923 __tcp_select_window(sk) >= tp->rcv_wnd) ||
4853 /* We ACK each frame or... */ 4924 /* We ACK each frame or... */
4854 tcp_in_quickack_mode(sk) || 4925 tcp_in_quickack_mode(sk) ||
4855 /* We have out of order data. */ 4926 /* We have out of order data. */
@@ -5070,10 +5141,12 @@ out:
5070static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5141static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5071 struct tcphdr *th, int syn_inerr) 5142 struct tcphdr *th, int syn_inerr)
5072{ 5143{
5144 u8 *hash_location;
5073 struct tcp_sock *tp = tcp_sk(sk); 5145 struct tcp_sock *tp = tcp_sk(sk);
5074 5146
5075 /* RFC1323: H1. Apply PAWS check first. */ 5147 /* RFC1323: H1. Apply PAWS check first. */
5076 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 5148 if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
5149 tp->rx_opt.saw_tstamp &&
5077 tcp_paws_discard(sk, skb)) { 5150 tcp_paws_discard(sk, skb)) {
5078 if (!th->rst) { 5151 if (!th->rst) {
5079 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 5152 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5361,11 +5434,13 @@ discard:
5361static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5434static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5362 struct tcphdr *th, unsigned len) 5435 struct tcphdr *th, unsigned len)
5363{ 5436{
5364 struct tcp_sock *tp = tcp_sk(sk); 5437 u8 *hash_location;
5365 struct inet_connection_sock *icsk = inet_csk(sk); 5438 struct inet_connection_sock *icsk = inet_csk(sk);
5439 struct tcp_sock *tp = tcp_sk(sk);
5440 struct tcp_cookie_values *cvp = tp->cookie_values;
5366 int saved_clamp = tp->rx_opt.mss_clamp; 5441 int saved_clamp = tp->rx_opt.mss_clamp;
5367 5442
5368 tcp_parse_options(skb, &tp->rx_opt, 0); 5443 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
5369 5444
5370 if (th->ack) { 5445 if (th->ack) {
5371 /* rfc793: 5446 /* rfc793:
@@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5462 * Change state from SYN-SENT only after copied_seq 5537 * Change state from SYN-SENT only after copied_seq
5463 * is initialized. */ 5538 * is initialized. */
5464 tp->copied_seq = tp->rcv_nxt; 5539 tp->copied_seq = tp->rcv_nxt;
5540
5541 if (cvp != NULL &&
5542 cvp->cookie_pair_size > 0 &&
5543 tp->rx_opt.cookie_plus > 0) {
5544 int cookie_size = tp->rx_opt.cookie_plus
5545 - TCPOLEN_COOKIE_BASE;
5546 int cookie_pair_size = cookie_size
5547 + cvp->cookie_desired;
5548
5549 /* A cookie extension option was sent and returned.
5550 * Note that each incoming SYNACK replaces the
5551 * Responder cookie. The initial exchange is most
5552 * fragile, as protection against spoofing relies
5553 * entirely upon the sequence and timestamp (above).
5554 * This replacement strategy allows the correct pair to
5555 * pass through, while any others will be filtered via
5556 * Responder verification later.
5557 */
5558 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
5559 memcpy(&cvp->cookie_pair[cvp->cookie_desired],
5560 hash_location, cookie_size);
5561 cvp->cookie_pair_size = cookie_pair_size;
5562 }
5563 }
5564
5465 smp_mb(); 5565 smp_mb();
5466 tcp_set_state(sk, TCP_ESTABLISHED); 5566 tcp_set_state(sk, TCP_ESTABLISHED);
5467 5567
@@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5699 5799
5700 /* tcp_ack considers this ACK as duplicate 5800 /* tcp_ack considers this ACK as duplicate
5701 * and does not calculate rtt. 5801 * and does not calculate rtt.
5702 * Fix it at least with timestamps. 5802 * Force it here.
5703 */ 5803 */
5704 if (tp->rx_opt.saw_tstamp && 5804 tcp_ack_update_rtt(sk, 0, 0);
5705 tp->rx_opt.rcv_tsecr && !tp->srtt)
5706 tcp_ack_saw_tstamp(sk, 0);
5707 5805
5708 if (tp->rx_opt.tstamp_ok) 5806 if (tp->rx_opt.tstamp_ok)
5709 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5807 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b53f61..3c23e70885f4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,6 +60,7 @@
60#include <linux/jhash.h> 60#include <linux/jhash.h>
61#include <linux/init.h> 61#include <linux/init.h>
62#include <linux/times.h> 62#include <linux/times.h>
63#include <linux/slab.h>
63 64
64#include <net/net_namespace.h> 65#include <net/net_namespace.h>
65#include <net/icmp.h> 66#include <net/icmp.h>
@@ -165,10 +166,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
165 nexthop = inet->opt->faddr; 166 nexthop = inet->opt->faddr;
166 } 167 }
167 168
168 tmp = ip_route_connect(&rt, nexthop, inet->saddr, 169 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 170 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170 IPPROTO_TCP, 171 IPPROTO_TCP,
171 inet->sport, usin->sin_port, sk, 1); 172 inet->inet_sport, usin->sin_port, sk, 1);
172 if (tmp < 0) { 173 if (tmp < 0) {
173 if (tmp == -ENETUNREACH) 174 if (tmp == -ENETUNREACH)
174 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 175 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -183,11 +184,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
183 if (!inet->opt || !inet->opt->srr) 184 if (!inet->opt || !inet->opt->srr)
184 daddr = rt->rt_dst; 185 daddr = rt->rt_dst;
185 186
186 if (!inet->saddr) 187 if (!inet->inet_saddr)
187 inet->saddr = rt->rt_src; 188 inet->inet_saddr = rt->rt_src;
188 inet->rcv_saddr = inet->saddr; 189 inet->inet_rcv_saddr = inet->inet_saddr;
189 190
190 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { 191 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191 /* Reset inherited state */ 192 /* Reset inherited state */
192 tp->rx_opt.ts_recent = 0; 193 tp->rx_opt.ts_recent = 0;
193 tp->rx_opt.ts_recent_stamp = 0; 194 tp->rx_opt.ts_recent_stamp = 0;
@@ -204,20 +205,20 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
204 * when trying new connection. 205 * when trying new connection.
205 */ 206 */
206 if (peer != NULL && 207 if (peer != NULL &&
207 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { 208 (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 209 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209 tp->rx_opt.ts_recent = peer->tcp_ts; 210 tp->rx_opt.ts_recent = peer->tcp_ts;
210 } 211 }
211 } 212 }
212 213
213 inet->dport = usin->sin_port; 214 inet->inet_dport = usin->sin_port;
214 inet->daddr = daddr; 215 inet->inet_daddr = daddr;
215 216
216 inet_csk(sk)->icsk_ext_hdr_len = 0; 217 inet_csk(sk)->icsk_ext_hdr_len = 0;
217 if (inet->opt) 218 if (inet->opt)
218 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; 219 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219 220
220 tp->rx_opt.mss_clamp = 536; 221 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221 222
222 /* Socket identity is still unknown (sport may be zero). 223 /* Socket identity is still unknown (sport may be zero).
223 * However we set state to SYN-SENT and not releasing socket 224 * However we set state to SYN-SENT and not releasing socket
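
This hunk, and the matching ones in tcp_v4_conn_request(), tcp_v4_remember_stamp() and tcp_v4_tw_remember_stamp() below, rewrite additive timestamp tests into the subtraction form, which stays correct when the u32 seconds counter wraps. A self-contained demonstration (userspace C, illustrative values only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t stamp = 4294967285u;  /* recorded ~11 s before u32 wrap */
        uint32_t now   = 4294967290u;  /* only 5 s later */
        uint32_t msl   = 60;

        /* additive form: stamp + msl wraps around to 49, so a
         * 5-second-old stamp is wrongly treated as expired */
        printf("additive:    %d\n", stamp + msl >= now);   /* 0 */

        /* subtraction form: elapsed time computes to 5 no matter
         * where the counter wraps */
        printf("subtraction: %d\n", now - stamp <= msl);   /* 1 */
        return 0;
    }
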
@@ -230,7 +231,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
230 goto failure; 231 goto failure;
231 232
232 err = ip_route_newports(&rt, IPPROTO_TCP, 233 err = ip_route_newports(&rt, IPPROTO_TCP,
233 inet->sport, inet->dport, sk); 234 inet->inet_sport, inet->inet_dport, sk);
234 if (err) 235 if (err)
235 goto failure; 236 goto failure;
236 237
@@ -239,12 +240,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
239 sk_setup_caps(sk, &rt->u.dst); 240 sk_setup_caps(sk, &rt->u.dst);
240 241
241 if (!tp->write_seq) 242 if (!tp->write_seq)
242 tp->write_seq = secure_tcp_sequence_number(inet->saddr, 243 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243 inet->daddr, 244 inet->inet_daddr,
244 inet->sport, 245 inet->inet_sport,
245 usin->sin_port); 246 usin->sin_port);
246 247
247 inet->id = tp->write_seq ^ jiffies; 248 inet->inet_id = tp->write_seq ^ jiffies;
248 249
249 err = tcp_connect(sk); 250 err = tcp_connect(sk);
250 rt = NULL; 251 rt = NULL;
@@ -261,7 +262,7 @@ failure:
261 tcp_set_state(sk, TCP_CLOSE); 262 tcp_set_state(sk, TCP_CLOSE);
262 ip_rt_put(rt); 263 ip_rt_put(rt);
263 sk->sk_route_caps = 0; 264 sk->sk_route_caps = 0;
264 inet->dport = 0; 265 inet->inet_dport = 0;
265 return err; 266 return err;
266} 267}
267 268
@@ -370,6 +371,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
370 if (sk->sk_state == TCP_CLOSE) 371 if (sk->sk_state == TCP_CLOSE)
371 goto out; 372 goto out;
372 373
374 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
375 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
376 goto out;
377 }
378
373 icsk = inet_csk(sk); 379 icsk = inet_csk(sk);
374 tp = tcp_sk(sk); 380 tp = tcp_sk(sk);
375 seq = ntohl(th->seq); 381 seq = ntohl(th->seq);
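
The min_ttl test (repeated in tcp_v4_rcv() further down) is the receive half of the Generalized TTL Security Mechanism (RFC 5082): if a segment arrives with a TTL below the socket's configured floor, it cannot have been sent from within the expected hop radius and is dropped under the new TCPMINTTLDROP counter. Userspace opts in per socket via the IP_MINTTL option that accompanies this change; a minimal sketch, guarded since older headers lack the constant:

    #include <netinet/in.h>
    #include <sys/socket.h>

    /* GTSM-style hardening: both peers transmit with TTL 255 and
     * refuse anything arriving below it, i.e. anything routed. */
    static int harden_socket(int fd)
    {
        int ttl = 255;

        if (setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
            return -1;
    #ifdef IP_MINTTL
        if (setsockopt(fd, IPPROTO_IP, IP_MINTTL, &ttl, sizeof(ttl)) < 0)
            return -1;
    #endif
        return 0;
    }
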
@@ -520,12 +526,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
520 struct tcphdr *th = tcp_hdr(skb); 526 struct tcphdr *th = tcp_hdr(skb);
521 527
522 if (skb->ip_summed == CHECKSUM_PARTIAL) { 528 if (skb->ip_summed == CHECKSUM_PARTIAL) {
523 th->check = ~tcp_v4_check(len, inet->saddr, 529 th->check = ~tcp_v4_check(len, inet->inet_saddr,
524 inet->daddr, 0); 530 inet->inet_daddr, 0);
525 skb->csum_start = skb_transport_header(skb) - skb->head; 531 skb->csum_start = skb_transport_header(skb) - skb->head;
526 skb->csum_offset = offsetof(struct tcphdr, check); 532 skb->csum_offset = offsetof(struct tcphdr, check);
527 } else { 533 } else {
528 th->check = tcp_v4_check(len, inet->saddr, inet->daddr, 534 th->check = tcp_v4_check(len, inet->inet_saddr,
535 inet->inet_daddr,
529 csum_partial(th, 536 csum_partial(th,
530 th->doff << 2, 537 th->doff << 2,
531 skb->csum)); 538 skb->csum));
@@ -741,8 +748,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
741 * This still operates on a request_sock only, not on a big 748 * This still operates on a request_sock only, not on a big
742 * socket. 749 * socket.
743 */ 750 */
744static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, 751static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
745 struct dst_entry *dst) 752 struct request_sock *req,
753 struct request_values *rvp)
746{ 754{
747 const struct inet_request_sock *ireq = inet_rsk(req); 755 const struct inet_request_sock *ireq = inet_rsk(req);
748 int err = -1; 756 int err = -1;
@@ -752,7 +760,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
752 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) 760 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
753 return -1; 761 return -1;
754 762
755 skb = tcp_make_synack(sk, dst, req); 763 skb = tcp_make_synack(sk, dst, req, rvp);
756 764
757 if (skb) { 765 if (skb) {
758 struct tcphdr *th = tcp_hdr(skb); 766 struct tcphdr *th = tcp_hdr(skb);
@@ -773,9 +781,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
773 return err; 781 return err;
774} 782}
775 783
776static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) 784static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
785 struct request_values *rvp)
777{ 786{
778 return __tcp_v4_send_synack(sk, req, NULL); 787 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
788 return tcp_v4_send_synack(sk, NULL, req, rvp);
779} 789}
780 790
781/* 791/*
@@ -848,7 +858,7 @@ static struct tcp_md5sig_key *
848struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, 858struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
849 struct sock *addr_sk) 859 struct sock *addr_sk)
850{ 860{
851 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr); 861 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
852} 862}
853 863
854EXPORT_SYMBOL(tcp_v4_md5_lookup); 864EXPORT_SYMBOL(tcp_v4_md5_lookup);
@@ -923,7 +933,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add);
923static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, 933static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
924 u8 *newkey, u8 newkeylen) 934 u8 *newkey, u8 newkeylen)
925{ 935{
926 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, 936 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
927 newkey, newkeylen); 937 newkey, newkeylen);
928} 938}
929 939
@@ -1089,8 +1099,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1089 __be32 saddr, daddr; 1099 __be32 saddr, daddr;
1090 1100
1091 if (sk) { 1101 if (sk) {
1092 saddr = inet_sk(sk)->saddr; 1102 saddr = inet_sk(sk)->inet_saddr;
1093 daddr = inet_sk(sk)->daddr; 1103 daddr = inet_sk(sk)->inet_daddr;
1094 } else if (req) { 1104 } else if (req) {
1095 saddr = inet_rsk(req)->loc_addr; 1105 saddr = inet_rsk(req)->loc_addr;
1096 daddr = inet_rsk(req)->rmt_addr; 1106 daddr = inet_rsk(req)->rmt_addr;
@@ -1189,10 +1199,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1189struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1199struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1190 .family = PF_INET, 1200 .family = PF_INET,
1191 .obj_size = sizeof(struct tcp_request_sock), 1201 .obj_size = sizeof(struct tcp_request_sock),
1192 .rtx_syn_ack = tcp_v4_send_synack, 1202 .rtx_syn_ack = tcp_v4_rtx_synack,
1193 .send_ack = tcp_v4_reqsk_send_ack, 1203 .send_ack = tcp_v4_reqsk_send_ack,
1194 .destructor = tcp_v4_reqsk_destructor, 1204 .destructor = tcp_v4_reqsk_destructor,
1195 .send_reset = tcp_v4_send_reset, 1205 .send_reset = tcp_v4_send_reset,
1206 .syn_ack_timeout = tcp_syn_ack_timeout,
1196}; 1207};
1197 1208
1198#ifdef CONFIG_TCP_MD5SIG 1209#ifdef CONFIG_TCP_MD5SIG
@@ -1210,13 +1221,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
1210 1221
1211int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1222int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1212{ 1223{
1213 struct inet_request_sock *ireq; 1224 struct tcp_extend_values tmp_ext;
1214 struct tcp_options_received tmp_opt; 1225 struct tcp_options_received tmp_opt;
1226 u8 *hash_location;
1215 struct request_sock *req; 1227 struct request_sock *req;
1228 struct inet_request_sock *ireq;
1229 struct tcp_sock *tp = tcp_sk(sk);
1230 struct dst_entry *dst = NULL;
1216 __be32 saddr = ip_hdr(skb)->saddr; 1231 __be32 saddr = ip_hdr(skb)->saddr;
1217 __be32 daddr = ip_hdr(skb)->daddr; 1232 __be32 daddr = ip_hdr(skb)->daddr;
1218 __u32 isn = TCP_SKB_CB(skb)->when; 1233 __u32 isn = TCP_SKB_CB(skb)->when;
1219 struct dst_entry *dst = NULL;
1220#ifdef CONFIG_SYN_COOKIES 1234#ifdef CONFIG_SYN_COOKIES
1221 int want_cookie = 0; 1235 int want_cookie = 0;
1222#else 1236#else
@@ -1257,16 +1271,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1257#endif 1271#endif
1258 1272
1259 tcp_clear_options(&tmp_opt); 1273 tcp_clear_options(&tmp_opt);
1260 tmp_opt.mss_clamp = 536; 1274 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1261 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; 1275 tmp_opt.user_mss = tp->rx_opt.user_mss;
1276 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1277
1278 if (tmp_opt.cookie_plus > 0 &&
1279 tmp_opt.saw_tstamp &&
1280 !tp->rx_opt.cookie_out_never &&
1281 (sysctl_tcp_cookie_size > 0 ||
1282 (tp->cookie_values != NULL &&
1283 tp->cookie_values->cookie_desired > 0))) {
1284 u8 *c;
1285 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1286 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1287
1288 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1289 goto drop_and_release;
1290
1291 /* Secret recipe starts with IP addresses */
1292 *mess++ ^= daddr;
1293 *mess++ ^= saddr;
1262 1294
1263 tcp_parse_options(skb, &tmp_opt, 0); 1295 /* plus variable length Initiator Cookie */
1296 c = (u8 *)mess;
1297 while (l-- > 0)
1298 *c++ ^= *hash_location++;
1299
1300#ifdef CONFIG_SYN_COOKIES
1301 want_cookie = 0; /* not our kind of cookie */
1302#endif
1303 tmp_ext.cookie_out_never = 0; /* false */
1304 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1305 } else if (!tp->rx_opt.cookie_in_always) {
1306 /* redundant indications, but ensure initialization. */
1307 tmp_ext.cookie_out_never = 1; /* true */
1308 tmp_ext.cookie_plus = 0;
1309 } else {
1310 goto drop_and_release;
1311 }
1312 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1264 1313
1265 if (want_cookie && !tmp_opt.saw_tstamp) 1314 if (want_cookie && !tmp_opt.saw_tstamp)
1266 tcp_clear_options(&tmp_opt); 1315 tcp_clear_options(&tmp_opt);
1267 1316
1268 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1317 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1269
1270 tcp_openreq_init(req, &tmp_opt, skb); 1318 tcp_openreq_init(req, &tmp_opt, skb);
1271 1319
1272 ireq = inet_rsk(req); 1320 ireq = inet_rsk(req);
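
The bakery buffer is split in two: the first COOKIE_DIGEST_WORDS hold the rotating server secret filled in by tcp_cookie_generator(), and the message half after it is salted, by XOR, with both addresses and then with the initiator's variable-length cookie. A simplified standalone sketch of that folding (sizes are illustrative):

    #include <stdint.h>

    #define DIGEST_WORDS   5    /* stand-in for COOKIE_DIGEST_WORDS */
    #define MESSAGE_WORDS  11   /* stand-in for the message area */

    static void fold_connection(uint32_t bakery[DIGEST_WORDS + MESSAGE_WORDS],
                                uint32_t saddr, uint32_t daddr,
                                const uint8_t *cookie, int cookie_len)
    {
        uint32_t *mess = &bakery[DIGEST_WORDS];
        uint8_t *c;

        /* secret recipe starts with the IP addresses */
        *mess++ ^= daddr;
        *mess++ ^= saddr;

        /* plus the variable-length initiator cookie; cookie_len is
         * bounded well below the message area in the real code */
        c = (uint8_t *)mess;
        while (cookie_len-- > 0)
            *c++ ^= *cookie++;
    }
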
@@ -1304,7 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1304 (dst = inet_csk_route_req(sk, req)) != NULL && 1352 (dst = inet_csk_route_req(sk, req)) != NULL &&
1305 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1353 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1306 peer->v4daddr == saddr) { 1354 peer->v4daddr == saddr) {
1307 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && 1355 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1308 (s32)(peer->tcp_ts - req->ts_recent) > 1356 (s32)(peer->tcp_ts - req->ts_recent) >
1309 TCP_PAWS_WINDOW) { 1357 TCP_PAWS_WINDOW) {
1310 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1358 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1333,7 +1381,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1333 } 1381 }
1334 tcp_rsk(req)->snt_isn = isn; 1382 tcp_rsk(req)->snt_isn = isn;
1335 1383
1336 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) 1384 if (tcp_v4_send_synack(sk, dst, req,
1385 (struct request_values *)&tmp_ext) ||
1386 want_cookie)
1337 goto drop_and_free; 1387 goto drop_and_free;
1338 1388
1339 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1389 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1380,9 +1430,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1380 newtp = tcp_sk(newsk); 1430 newtp = tcp_sk(newsk);
1381 newinet = inet_sk(newsk); 1431 newinet = inet_sk(newsk);
1382 ireq = inet_rsk(req); 1432 ireq = inet_rsk(req);
1383 newinet->daddr = ireq->rmt_addr; 1433 newinet->inet_daddr = ireq->rmt_addr;
1384 newinet->rcv_saddr = ireq->loc_addr; 1434 newinet->inet_rcv_saddr = ireq->loc_addr;
1385 newinet->saddr = ireq->loc_addr; 1435 newinet->inet_saddr = ireq->loc_addr;
1386 newinet->opt = ireq->opt; 1436 newinet->opt = ireq->opt;
1387 ireq->opt = NULL; 1437 ireq->opt = NULL;
1388 newinet->mc_index = inet_iif(skb); 1438 newinet->mc_index = inet_iif(skb);
@@ -1390,7 +1440,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1390 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1440 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1391 if (newinet->opt) 1441 if (newinet->opt)
1392 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1442 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1393 newinet->id = newtp->write_seq ^ jiffies; 1443 newinet->inet_id = newtp->write_seq ^ jiffies;
1394 1444
1395 tcp_mtup_init(newsk); 1445 tcp_mtup_init(newsk);
1396 tcp_sync_mss(newsk, dst_mtu(dst)); 1446 tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1403,7 +1453,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1403 1453
1404#ifdef CONFIG_TCP_MD5SIG 1454#ifdef CONFIG_TCP_MD5SIG
1405 /* Copy over the MD5 key from the original socket */ 1455 /* Copy over the MD5 key from the original socket */
1406 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { 1456 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1457 if (key != NULL) {
1407 /* 1458 /*
1408 * We're using one, so create a matching key 1459 * We're using one, so create a matching key
1409 * on the newsk structure. If we fail to get 1460 * on the newsk structure. If we fail to get
@@ -1412,13 +1463,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1412 */ 1463 */
1413 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); 1464 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1414 if (newkey != NULL) 1465 if (newkey != NULL)
1415 tcp_v4_md5_do_add(newsk, newinet->daddr, 1466 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1416 newkey, key->keylen); 1467 newkey, key->keylen);
1417 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1468 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1418 } 1469 }
1419#endif 1470#endif
1420 1471
1421 __inet_hash_nolisten(newsk); 1472 __inet_hash_nolisten(newsk, NULL);
1422 __inet_inherit_port(sk, newsk); 1473 __inet_inherit_port(sk, newsk);
1423 1474
1424 return newsk; 1475 return newsk;
@@ -1610,6 +1661,11 @@ process:
1610 if (sk->sk_state == TCP_TIME_WAIT) 1661 if (sk->sk_state == TCP_TIME_WAIT)
1611 goto do_time_wait; 1662 goto do_time_wait;
1612 1663
1664 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1666 goto discard_and_relse;
1667 }
1668
1613 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1669 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1614 goto discard_and_relse; 1670 goto discard_and_relse;
1615 nf_reset(skb); 1671 nf_reset(skb);
@@ -1634,8 +1690,11 @@ process:
1634 if (!tcp_prequeue(sk, skb)) 1690 if (!tcp_prequeue(sk, skb))
1635 ret = tcp_v4_do_rcv(sk, skb); 1691 ret = tcp_v4_do_rcv(sk, skb);
1636 } 1692 }
1637 } else 1693 } else if (unlikely(sk_add_backlog(sk, skb))) {
1638 sk_add_backlog(sk, skb); 1694 bh_unlock_sock(sk);
1695 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1696 goto discard_and_relse;
1697 }
1639 bh_unlock_sock(sk); 1698 bh_unlock_sock(sk);
1640 1699
1641 sock_put(sk); 1700 sock_put(sk);
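
sk_add_backlog() now fails once the backlog charge would exceed the socket's receive-buffer limit, and the caller unlocks and accounts a TCPBACKLOGDROP instead of queueing without bound, closing a path by which a flood against a busy socket could pin arbitrary amounts of kernel memory. The shape of the pattern, as a hedged generic sketch (field names are illustrative, not struct sock's):

    #include <linux/errno.h>
    #include <linux/skbuff.h>

    struct bounded_backlog {
        struct sk_buff_head queue;
        unsigned int        charge; /* bytes currently queued */
        unsigned int        limit;  /* e.g. sk->sk_rcvbuf */
    };

    static int backlog_add(struct bounded_backlog *b, struct sk_buff *skb)
    {
        if (b->charge + skb->truesize > b->limit)
            return -ENOBUFS;            /* caller counts the drop */
        b->charge += skb->truesize;
        __skb_queue_tail(&b->queue, skb);
        return 0;
    }
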
@@ -1711,8 +1770,8 @@ int tcp_v4_remember_stamp(struct sock *sk)
1711 struct inet_peer *peer = NULL; 1770 struct inet_peer *peer = NULL;
1712 int release_it = 0; 1771 int release_it = 0;
1713 1772
1714 if (!rt || rt->rt_dst != inet->daddr) { 1773 if (!rt || rt->rt_dst != inet->inet_daddr) {
1715 peer = inet_getpeer(inet->daddr, 1); 1774 peer = inet_getpeer(inet->inet_daddr, 1);
1716 release_it = 1; 1775 release_it = 1;
1717 } else { 1776 } else {
1718 if (!rt->peer) 1777 if (!rt->peer)
@@ -1722,9 +1781,9 @@ int tcp_v4_remember_stamp(struct sock *sk)
1722 1781
1723 if (peer) { 1782 if (peer) {
1724 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || 1783 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1725 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && 1784 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1726 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { 1785 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1727 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; 1786 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1728 peer->tcp_ts = tp->rx_opt.ts_recent; 1787 peer->tcp_ts = tp->rx_opt.ts_recent;
1729 } 1788 }
1730 if (release_it) 1789 if (release_it)
@@ -1743,9 +1802,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1743 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 1802 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1744 1803
1745 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || 1804 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1746 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && 1805 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1747 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { 1806 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1748 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; 1807 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1749 peer->tcp_ts = tcptw->tw_ts_recent; 1808 peer->tcp_ts = tcptw->tw_ts_recent;
1750 } 1809 }
1751 inet_putpeer(peer); 1810 inet_putpeer(peer);
@@ -1810,7 +1869,7 @@ static int tcp_v4_init_sock(struct sock *sk)
1810 */ 1869 */
1811 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 1870 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1812 tp->snd_cwnd_clamp = ~0; 1871 tp->snd_cwnd_clamp = ~0;
1813 tp->mss_cache = 536; 1872 tp->mss_cache = TCP_MSS_DEFAULT;
1814 1873
1815 tp->reordering = sysctl_tcp_reordering; 1874 tp->reordering = sysctl_tcp_reordering;
1816 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 1875 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1826,6 +1885,19 @@ static int tcp_v4_init_sock(struct sock *sk)
1826 tp->af_specific = &tcp_sock_ipv4_specific; 1885 tp->af_specific = &tcp_sock_ipv4_specific;
1827#endif 1886#endif
1828 1887
1888 /* TCP Cookie Transactions */
1889 if (sysctl_tcp_cookie_size > 0) {
1890 /* Default, cookies without s_data_payload. */
1891 tp->cookie_values =
1892 kzalloc(sizeof(*tp->cookie_values),
1893 sk->sk_allocation);
1894 if (tp->cookie_values != NULL)
1895 kref_init(&tp->cookie_values->kref);
1896 }
1897 /* Presumed zeroed, in order of appearance:
1898 * cookie_in_always, cookie_out_never,
1899 * s_data_constant, s_data_in, s_data_out
1900 */
1829 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1901 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1830 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1902 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1831 1903
@@ -1879,6 +1951,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
1879 sk->sk_sndmsg_page = NULL; 1951 sk->sk_sndmsg_page = NULL;
1880 } 1952 }
1881 1953
1954 /* TCP Cookie Transactions */
1955 if (tp->cookie_values != NULL) {
1956 kref_put(&tp->cookie_values->kref,
1957 tcp_cookie_values_release);
1958 tp->cookie_values = NULL;
1959 }
1960
1882 percpu_counter_dec(&tcp_sockets_allocated); 1961 percpu_counter_dec(&tcp_sockets_allocated);
1883} 1962}
1884 1963
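
cookie_values is allocated once per socket and, as the tcp_minisocks.c hunk below shows, child sockets get their own copy, so lifetime is managed with a kref: kzalloc() plus kref_init() in the init path above, kref_put() with a release callback in the destroy path. The standard pattern, sketched with an illustrative struct:

    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/slab.h>

    struct cookie_box {
        struct kref kref;
        /* ... cookie pair, s_data payload ... */
    };

    static void cookie_box_release(struct kref *kref)
    {
        kfree(container_of(kref, struct cookie_box, kref));
    }

    static struct cookie_box *cookie_box_alloc(gfp_t gfp)
    {
        struct cookie_box *box = kzalloc(sizeof(*box), gfp);

        if (box)
            kref_init(&box->kref);      /* refcount starts at 1 */
        return box;
    }

    static void cookie_box_put(struct cookie_box *box)
    {
        /* last holder frees the allocation */
        kref_put(&box->kref, cookie_box_release);
    }
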
@@ -2000,7 +2079,7 @@ static void *established_get_first(struct seq_file *seq)
2000 struct net *net = seq_file_net(seq); 2079 struct net *net = seq_file_net(seq);
2001 void *rc = NULL; 2080 void *rc = NULL;
2002 2081
2003 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 2082 for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2004 struct sock *sk; 2083 struct sock *sk;
2005 struct hlist_nulls_node *node; 2084 struct hlist_nulls_node *node;
2006 struct inet_timewait_sock *tw; 2085 struct inet_timewait_sock *tw;
@@ -2061,10 +2140,10 @@ get_tw:
2061 st->state = TCP_SEQ_STATE_ESTABLISHED; 2140 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 2141
2063 /* Look for next non empty bucket */ 2142 /* Look for next non empty bucket */
2064 while (++st->bucket < tcp_hashinfo.ehash_size && 2143 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2065 empty_bucket(st)) 2144 empty_bucket(st))
2066 ; 2145 ;
2067 if (st->bucket >= tcp_hashinfo.ehash_size) 2146 if (st->bucket > tcp_hashinfo.ehash_mask)
2068 return NULL; 2147 return NULL;
2069 2148
2070 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2149 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
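
The established hash is now sized as a power of two and described by its mask (size - 1) rather than its size: a slot is chosen with hash & ehash_mask, and a full walk runs from bucket 0 through ehash_mask inclusive. Worked example: with 65536 buckets, ehash_mask is 65535, and the new "bucket <= ehash_mask" bound visits exactly the same 0..65535 range the old "bucket < ehash_size" did; the only trap is mixing the two conventions, which is why every comparison in this iterator changes together.
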
@@ -2225,7 +2304,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2225 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2304 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2226 i, 2305 i,
2227 ireq->loc_addr, 2306 ireq->loc_addr,
2228 ntohs(inet_sk(sk)->sport), 2307 ntohs(inet_sk(sk)->inet_sport),
2229 ireq->rmt_addr, 2308 ireq->rmt_addr,
2230 ntohs(ireq->rmt_port), 2309 ntohs(ireq->rmt_port),
2231 TCP_SYN_RECV, 2310 TCP_SYN_RECV,
@@ -2248,10 +2327,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2248 struct tcp_sock *tp = tcp_sk(sk); 2327 struct tcp_sock *tp = tcp_sk(sk);
2249 const struct inet_connection_sock *icsk = inet_csk(sk); 2328 const struct inet_connection_sock *icsk = inet_csk(sk);
2250 struct inet_sock *inet = inet_sk(sk); 2329 struct inet_sock *inet = inet_sk(sk);
2251 __be32 dest = inet->daddr; 2330 __be32 dest = inet->inet_daddr;
2252 __be32 src = inet->rcv_saddr; 2331 __be32 src = inet->inet_rcv_saddr;
2253 __u16 destp = ntohs(inet->dport); 2332 __u16 destp = ntohs(inet->inet_dport);
2254 __u16 srcp = ntohs(inet->sport); 2333 __u16 srcp = ntohs(inet->inet_sport);
2334 int rx_queue;
2255 2335
2256 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 2336 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2257 timer_active = 1; 2337 timer_active = 1;
@@ -2267,12 +2347,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2267 timer_expires = jiffies; 2347 timer_expires = jiffies;
2268 } 2348 }
2269 2349
2350 if (sk->sk_state == TCP_LISTEN)
2351 rx_queue = sk->sk_ack_backlog;
2352 else
2353 /*
2354 * because we dont lock socket, we might find a transient negative value
2355 */
2356 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2357
2270 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2358 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2271 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2359 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2272 i, src, srcp, dest, destp, sk->sk_state, 2360 i, src, srcp, dest, destp, sk->sk_state,
2273 tp->write_seq - tp->snd_una, 2361 tp->write_seq - tp->snd_una,
2274 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : 2362 rx_queue,
2275 (tp->rcv_nxt - tp->copied_seq),
2276 timer_active, 2363 timer_active,
2277 jiffies_to_clock_t(timer_expires - jiffies), 2364 jiffies_to_clock_t(timer_expires - jiffies),
2278 icsk->icsk_retransmits, 2365 icsk->icsk_retransmits,
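
The clamp exists because /proc readers sample tp->rcv_nxt and tp->copied_seq without taking the socket lock: the two loads can interleave with an application thread consuming data, momentarily making copied_seq appear ahead of rcv_nxt. Since both are u32, the raw difference would then print as a huge positive receive-queue size; max_t(int, ..., 0) turns the transient negative into 0 instead.
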
@@ -2354,12 +2441,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2354 }, 2441 },
2355}; 2442};
2356 2443
2357static int tcp4_proc_init_net(struct net *net) 2444static int __net_init tcp4_proc_init_net(struct net *net)
2358{ 2445{
2359 return tcp_proc_register(net, &tcp4_seq_afinfo); 2446 return tcp_proc_register(net, &tcp4_seq_afinfo);
2360} 2447}
2361 2448
2362static void tcp4_proc_exit_net(struct net *net) 2449static void __net_exit tcp4_proc_exit_net(struct net *net)
2363{ 2450{
2364 tcp_proc_unregister(net, &tcp4_seq_afinfo); 2451 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2365} 2452}
@@ -2463,12 +2550,17 @@ static int __net_init tcp_sk_init(struct net *net)
2463static void __net_exit tcp_sk_exit(struct net *net) 2550static void __net_exit tcp_sk_exit(struct net *net)
2464{ 2551{
2465 inet_ctl_sock_destroy(net->ipv4.tcp_sock); 2552 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2466 inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); 2553}
2554
2555static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2556{
2557 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2467} 2558}
2468 2559
2469static struct pernet_operations __net_initdata tcp_sk_ops = { 2560static struct pernet_operations __net_initdata tcp_sk_ops = {
2470 .init = tcp_sk_init, 2561 .init = tcp_sk_init,
2471 .exit = tcp_sk_exit, 2562 .exit = tcp_sk_exit,
2563 .exit_batch = tcp_sk_exit_batch,
2472}; 2564};
2473 2565
2474void __init tcp_v4_init(void) 2566void __init tcp_v4_init(void)
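
Previously every exiting namespace triggered its own full sweep of the global timewait hash; the new exit_batch hook runs once per cleanup batch with the list of all namespaces being torn down, so one inet_twsk_purge() serves the whole batch. The registration shape, as a hedged sketch of the pernet API:

    #include <net/net_namespace.h>

    static int __net_init foo_net_init(struct net *net)
    {
        return 0;               /* per-namespace setup */
    }

    static void __net_exit foo_net_exit(struct net *net)
    {
        /* cheap per-namespace teardown, runs once per netns */
    }

    static void __net_exit foo_net_exit_batch(struct list_head *net_exit_list)
    {
        /* expensive global work (e.g. one full hash sweep), run
         * once for all namespaces on net_exit_list */
    }

    static struct pernet_operations foo_net_ops = {
        .init       = foo_net_init,
        .exit       = foo_net_exit,
        .exit_batch = foo_net_exit_batch,
    };
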
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ce3c41ff50b2..de870377fbba 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
143 goto out; 143 goto out;
144 144
145 /* we can't calc remote HZ with no different!! */ 145 /* we can't calc remote HZ with no different!! */
146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time 146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
147 || tp->rx_opt.rcv_tsecr == lp->local_ref_time) 147 tp->rx_opt.rcv_tsecr == lp->local_ref_time)
148 goto out; 148 goto out;
149 149
150 m = HZ * (tp->rx_opt.rcv_tsval - 150 m = HZ * (tp->rx_opt.rcv_tsval -
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4c03598ed924..5fabff9ac6d6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,19 +20,14 @@
20 20
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h>
23#include <linux/sysctl.h> 24#include <linux/sysctl.h>
24#include <linux/workqueue.h> 25#include <linux/workqueue.h>
25#include <net/tcp.h> 26#include <net/tcp.h>
26#include <net/inet_common.h> 27#include <net/inet_common.h>
27#include <net/xfrm.h> 28#include <net/xfrm.h>
28 29
29#ifdef CONFIG_SYSCTL 30int sysctl_tcp_syncookies __read_mostly = 1;
30#define SYNC_INIT 0 /* let the user enable it */
31#else
32#define SYNC_INIT 1
33#endif
34
35int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
36EXPORT_SYMBOL(sysctl_tcp_syncookies); 31EXPORT_SYMBOL(sysctl_tcp_syncookies);
37 32
38int sysctl_tcp_abort_on_overflow __read_mostly; 33int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -96,13 +91,14 @@ enum tcp_tw_status
96tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 91tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
97 const struct tcphdr *th) 92 const struct tcphdr *th)
98{ 93{
99 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
100 struct tcp_options_received tmp_opt; 94 struct tcp_options_received tmp_opt;
95 u8 *hash_location;
96 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
101 int paws_reject = 0; 97 int paws_reject = 0;
102 98
103 tmp_opt.saw_tstamp = 0; 99 tmp_opt.saw_tstamp = 0;
104 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 100 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
105 tcp_parse_options(skb, &tmp_opt, 0); 101 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
106 102
107 if (tmp_opt.saw_tstamp) { 103 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 104 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -389,14 +385,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
389 const struct inet_request_sock *ireq = inet_rsk(req); 385 const struct inet_request_sock *ireq = inet_rsk(req);
390 struct tcp_request_sock *treq = tcp_rsk(req); 386 struct tcp_request_sock *treq = tcp_rsk(req);
391 struct inet_connection_sock *newicsk = inet_csk(newsk); 387 struct inet_connection_sock *newicsk = inet_csk(newsk);
392 struct tcp_sock *newtp; 388 struct tcp_sock *newtp = tcp_sk(newsk);
389 struct tcp_sock *oldtp = tcp_sk(sk);
390 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
391
392 /* TCP Cookie Transactions require space for the cookie pair,
393 * as it differs for each connection. There is no need to
394 * copy any s_data_payload stored at the original socket.
395 * Failure will prevent resuming the connection.
396 *
397 * Presumed copied, in order of appearance:
398 * cookie_in_always, cookie_out_never
399 */
400 if (oldcvp != NULL) {
401 struct tcp_cookie_values *newcvp =
402 kzalloc(sizeof(*newtp->cookie_values),
403 GFP_ATOMIC);
404
405 if (newcvp != NULL) {
406 kref_init(&newcvp->kref);
407 newcvp->cookie_desired =
408 oldcvp->cookie_desired;
409 newtp->cookie_values = newcvp;
410 } else {
411 /* Not Yet Implemented */
412 newtp->cookie_values = NULL;
413 }
414 }
393 415
394 /* Now setup tcp_sock */ 416 /* Now setup tcp_sock */
395 newtp = tcp_sk(newsk);
396 newtp->pred_flags = 0; 417 newtp->pred_flags = 0;
397 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 418
398 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 419 newtp->rcv_wup = newtp->copied_seq =
399 newtp->snd_up = treq->snt_isn + 1; 420 newtp->rcv_nxt = treq->rcv_isn + 1;
421
422 newtp->snd_sml = newtp->snd_una =
423 newtp->snd_nxt = newtp->snd_up =
424 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
400 425
401 tcp_prequeue_init(newtp); 426 tcp_prequeue_init(newtp);
402 427
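
The s_data payload carried on the SYNACK occupies sequence space like ordinary data, so every sender-side initial number gains a tcp_s_data_size() term. Worked example: with snt_isn = 1000 and 16 bytes of s_data, snd_una, snd_nxt, snd_up (and, in the hunk below, write_seq and pushed_seq) all start at 1000 + 1 + 16 = 1017, one for the SYN flag plus sixteen for the payload, and tcp_check_req() further down requires the handshake-completing ACK to acknowledge exactly that value.
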
@@ -429,8 +454,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
429 tcp_set_ca_state(newsk, TCP_CA_Open); 454 tcp_set_ca_state(newsk, TCP_CA_Open);
430 tcp_init_xmit_timers(newsk); 455 tcp_init_xmit_timers(newsk);
431 skb_queue_head_init(&newtp->out_of_order_queue); 456 skb_queue_head_init(&newtp->out_of_order_queue);
432 newtp->write_seq = treq->snt_isn + 1; 457 newtp->write_seq = newtp->pushed_seq =
433 newtp->pushed_seq = newtp->write_seq; 458 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
434 459
435 newtp->rx_opt.saw_tstamp = 0; 460 newtp->rx_opt.saw_tstamp = 0;
436 461
@@ -476,7 +501,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
476 if (newtp->af_specific->md5_lookup(sk, newsk)) 501 if (newtp->af_specific->md5_lookup(sk, newsk))
477 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 502 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
478#endif 503#endif
479 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 504 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
480 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 505 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
481 newtp->rx_opt.mss_clamp = req->mss; 506 newtp->rx_opt.mss_clamp = req->mss;
482 TCP_ECN_openreq_child(newtp, req); 507 TCP_ECN_openreq_child(newtp, req);
@@ -495,15 +520,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
495 struct request_sock *req, 520 struct request_sock *req,
496 struct request_sock **prev) 521 struct request_sock **prev)
497{ 522{
523 struct tcp_options_received tmp_opt;
524 u8 *hash_location;
525 struct sock *child;
498 const struct tcphdr *th = tcp_hdr(skb); 526 const struct tcphdr *th = tcp_hdr(skb);
499 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 527 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
500 int paws_reject = 0; 528 int paws_reject = 0;
501 struct tcp_options_received tmp_opt;
502 struct sock *child;
503 529
504 tmp_opt.saw_tstamp = 0; 530 tmp_opt.saw_tstamp = 0;
505 if (th->doff > (sizeof(struct tcphdr)>>2)) { 531 if (th->doff > (sizeof(struct tcphdr)>>2)) {
506 tcp_parse_options(skb, &tmp_opt, 0); 532 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
507 533
508 if (tmp_opt.saw_tstamp) { 534 if (tmp_opt.saw_tstamp) {
509 tmp_opt.ts_recent = req->ts_recent; 535 tmp_opt.ts_recent = req->ts_recent;
@@ -537,7 +563,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
537 * Enforce "SYN-ACK" according to figure 8, figure 6 563 * Enforce "SYN-ACK" according to figure 8, figure 6
538 * of RFC793, fixed by RFC1122. 564 * of RFC793, fixed by RFC1122.
539 */ 565 */
540 req->rsk_ops->rtx_syn_ack(sk, req); 566 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
541 return NULL; 567 return NULL;
542 } 568 }
543 569
@@ -596,7 +622,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
596 * Invalid ACK: reset will be sent by listening socket 622 * Invalid ACK: reset will be sent by listening socket
597 */ 623 */
598 if ((flg & TCP_FLAG_ACK) && 624 if ((flg & TCP_FLAG_ACK) &&
599 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) 625 (TCP_SKB_CB(skb)->ack_seq !=
626 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
600 return sk; 627 return sk;
601 628
602 /* Also, it would be not so bad idea to check rcv_tsecr, which 629 /* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -702,7 +729,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
702 * in main socket hash table and lock on listening 729 * in main socket hash table and lock on listening
703 * socket does not protect us more. 730 * socket does not protect us more.
704 */ 731 */
705 sk_add_backlog(child, skb); 732 __sk_add_backlog(child, skb);
706 } 733 }
707 734
708 bh_unlock_sock(child); 735 bh_unlock_sock(child);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a7080e..0dda86e72ad8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
37#include <net/tcp.h> 37#include <net/tcp.h>
38 38
39#include <linux/compiler.h> 39#include <linux/compiler.h>
40#include <linux/gfp.h>
40#include <linux/module.h> 41#include <linux/module.h>
41 42
42/* People can turn this off for buggy TCP's found in printers etc. */ 43/* People can turn this off for buggy TCP's found in printers etc. */
@@ -59,6 +60,10 @@ int sysctl_tcp_base_mss __read_mostly = 512;
59/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
60int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
61 62
63int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
64EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
65
66
62/* Account for new data that has been sent to the network. */ 67/* Account for new data that has been sent to the network. */
63static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) 68static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
64{ 69{
@@ -179,7 +184,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
179 */ 184 */
180void tcp_select_initial_window(int __space, __u32 mss, 185void tcp_select_initial_window(int __space, __u32 mss,
181 __u32 *rcv_wnd, __u32 *window_clamp, 186 __u32 *rcv_wnd, __u32 *window_clamp,
182 int wscale_ok, __u8 *rcv_wscale) 187 int wscale_ok, __u8 *rcv_wscale,
188 __u32 init_rcv_wnd)
183{ 189{
184 unsigned int space = (__space < 0 ? 0 : __space); 190 unsigned int space = (__space < 0 ? 0 : __space);
185 191
@@ -228,7 +234,13 @@ void tcp_select_initial_window(int __space, __u32 mss,
228 init_cwnd = 2; 234 init_cwnd = 2;
229 else if (mss > 1460) 235 else if (mss > 1460)
230 init_cwnd = 3; 236 init_cwnd = 3;
231 if (*rcv_wnd > init_cwnd * mss) 237 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above
239 */
240 if (init_rcv_wnd &&
241 (*rcv_wnd > init_rcv_wnd * mss))
242 *rcv_wnd = init_rcv_wnd * mss;
243 else if (*rcv_wnd > init_cwnd * mss)
232 *rcv_wnd = init_cwnd * mss; 244 *rcv_wnd = init_cwnd * mss;
233 } 245 }
234 246
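
The new init_rcv_wnd argument lets a per-route metric override the conservative default above; tcp_make_synack() below feeds it from dst_metric(dst, RTAX_INITRWND). With contemporary iproute2 this is the initrwnd route option, e.g. "ip route change default via 192.0.2.1 initrwnd 10", which at a 1460-byte MSS clamps the initial advertised window to 10 * 1460 = 14600 bytes instead of the default four segments (5840 bytes).
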
@@ -362,15 +374,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
362#define OPTION_TS (1 << 1) 374#define OPTION_TS (1 << 1)
363#define OPTION_MD5 (1 << 2) 375#define OPTION_MD5 (1 << 2)
364#define OPTION_WSCALE (1 << 3) 376#define OPTION_WSCALE (1 << 3)
377#define OPTION_COOKIE_EXTENSION (1 << 4)
365 378
366struct tcp_out_options { 379struct tcp_out_options {
367 u8 options; /* bit field of OPTION_* */ 380 u8 options; /* bit field of OPTION_* */
368 u8 ws; /* window scale, 0 to disable */ 381 u8 ws; /* window scale, 0 to disable */
369 u8 num_sack_blocks; /* number of SACK blocks to include */ 382 u8 num_sack_blocks; /* number of SACK blocks to include */
383 u8 hash_size; /* bytes in hash_location */
370 u16 mss; /* 0 to disable */ 384 u16 mss; /* 0 to disable */
371 __u32 tsval, tsecr; /* need to include OPTION_TS */ 385 __u32 tsval, tsecr; /* need to include OPTION_TS */
386 __u8 *hash_location; /* temporary pointer, overloaded */
372}; 387};
373 388
389/* The sysctl int routines are generic, so check consistency here.
390 */
391static u8 tcp_cookie_size_check(u8 desired)
392{
393 if (desired > 0) {
394 /* previously specified */
395 return desired;
396 }
397 if (sysctl_tcp_cookie_size <= 0) {
398 /* no default specified */
399 return 0;
400 }
401 if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
402 /* value too small, specify minimum */
403 return TCP_COOKIE_MIN;
404 }
405 if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
406 /* value too large, specify maximum */
407 return TCP_COOKIE_MAX;
408 }
409 if (0x1 & sysctl_tcp_cookie_size) {
410 /* 8-bit multiple, illegal, fix it */
411 return (u8)(sysctl_tcp_cookie_size + 0x1);
412 }
413 return (u8)sysctl_tcp_cookie_size;
414}
415
374/* Write previously computed TCP options to the packet. 416/* Write previously computed TCP options to the packet.
375 * 417 *
376 * Beware: Something in the Internet is very sensitive to the ordering of 418 * Beware: Something in the Internet is very sensitive to the ordering of
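
tcp_cookie_size_check() normalizes in a fixed order: an explicit per-connection desire is returned untouched; otherwise the sysctl is clamped into the series' [TCP_COOKIE_MIN, TCP_COOKIE_MAX] range (8 and 16 bytes respectively) and rounded up to an even value. Worked cases, with desired = 0: sysctl_tcp_cookie_size = 0 disables cookies, 6 becomes 8, 9 becomes 10, and 40 becomes 16.
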
@@ -385,17 +427,34 @@ struct tcp_out_options {
385 * (but it may well be that other scenarios fail similarly). 427 * (but it may well be that other scenarios fail similarly).
386 */ 428 */
387static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 429static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
388 const struct tcp_out_options *opts, 430 struct tcp_out_options *opts)
389 __u8 **md5_hash) { 431{
390 if (unlikely(OPTION_MD5 & opts->options)) { 432 u8 options = opts->options; /* mungable copy */
391 *ptr++ = htonl((TCPOPT_NOP << 24) | 433
392 (TCPOPT_NOP << 16) | 434 /* Having both authentication and cookies for security is redundant,
393 (TCPOPT_MD5SIG << 8) | 435 * and there's certainly not enough room. Instead, the cookie-less
394 TCPOLEN_MD5SIG); 436 * extension variant is proposed.
395 *md5_hash = (__u8 *)ptr; 437 *
438 * Consider the pessimal case with authentication. The options
439 * could look like:
440 * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
441 */
442 if (unlikely(OPTION_MD5 & options)) {
443 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
444 *ptr++ = htonl((TCPOPT_COOKIE << 24) |
445 (TCPOLEN_COOKIE_BASE << 16) |
446 (TCPOPT_MD5SIG << 8) |
447 TCPOLEN_MD5SIG);
448 } else {
449 *ptr++ = htonl((TCPOPT_NOP << 24) |
450 (TCPOPT_NOP << 16) |
451 (TCPOPT_MD5SIG << 8) |
452 TCPOLEN_MD5SIG);
453 }
454 options &= ~OPTION_COOKIE_EXTENSION;
455 /* overload cookie hash location */
456 opts->hash_location = (__u8 *)ptr;
396 ptr += 4; 457 ptr += 4;
397 } else {
398 *md5_hash = NULL;
399 } 458 }
400 459
401 if (unlikely(opts->mss)) { 460 if (unlikely(opts->mss)) {
@@ -404,12 +463,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
404 opts->mss); 463 opts->mss);
405 } 464 }
406 465
407 if (likely(OPTION_TS & opts->options)) { 466 if (likely(OPTION_TS & options)) {
408 if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { 467 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
409 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | 468 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
410 (TCPOLEN_SACK_PERM << 16) | 469 (TCPOLEN_SACK_PERM << 16) |
411 (TCPOPT_TIMESTAMP << 8) | 470 (TCPOPT_TIMESTAMP << 8) |
412 TCPOLEN_TIMESTAMP); 471 TCPOLEN_TIMESTAMP);
472 options &= ~OPTION_SACK_ADVERTISE;
413 } else { 473 } else {
414 *ptr++ = htonl((TCPOPT_NOP << 24) | 474 *ptr++ = htonl((TCPOPT_NOP << 24) |
415 (TCPOPT_NOP << 16) | 475 (TCPOPT_NOP << 16) |
@@ -420,15 +480,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
420 *ptr++ = htonl(opts->tsecr); 480 *ptr++ = htonl(opts->tsecr);
421 } 481 }
422 482
423 if (unlikely(OPTION_SACK_ADVERTISE & opts->options && 483 /* Specification requires after timestamp, so do it now.
424 !(OPTION_TS & opts->options))) { 484 *
485 * Consider the pessimal case without authentication. The options
486 * could look like:
487 * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
488 */
489 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
490 __u8 *cookie_copy = opts->hash_location;
491 u8 cookie_size = opts->hash_size;
492
493 /* 8-bit multiple handled in tcp_cookie_size_check() above,
494 * and elsewhere.
495 */
496 if (0x2 & cookie_size) {
497 __u8 *p = (__u8 *)ptr;
498
499 /* 16-bit multiple */
500 *p++ = TCPOPT_COOKIE;
501 *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
502 *p++ = *cookie_copy++;
503 *p++ = *cookie_copy++;
504 ptr++;
505 cookie_size -= 2;
506 } else {
507 /* 32-bit multiple */
508 *ptr++ = htonl(((TCPOPT_NOP << 24) |
509 (TCPOPT_NOP << 16) |
510 (TCPOPT_COOKIE << 8) |
511 TCPOLEN_COOKIE_BASE) +
512 cookie_size);
513 }
514
515 if (cookie_size > 0) {
516 memcpy(ptr, cookie_copy, cookie_size);
517 ptr += (cookie_size / 4);
518 }
519 }
520
521 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
425 *ptr++ = htonl((TCPOPT_NOP << 24) | 522 *ptr++ = htonl((TCPOPT_NOP << 24) |
426 (TCPOPT_NOP << 16) | 523 (TCPOPT_NOP << 16) |
427 (TCPOPT_SACK_PERM << 8) | 524 (TCPOPT_SACK_PERM << 8) |
428 TCPOLEN_SACK_PERM); 525 TCPOLEN_SACK_PERM);
429 } 526 }
430 527
431 if (unlikely(OPTION_WSCALE & opts->options)) { 528 if (unlikely(OPTION_WSCALE & options)) {
432 *ptr++ = htonl((TCPOPT_NOP << 24) | 529 *ptr++ = htonl((TCPOPT_NOP << 24) |
433 (TCPOPT_WINDOW << 16) | 530 (TCPOPT_WINDOW << 16) |
434 (TCPOLEN_WINDOW << 8) | 531 (TCPOLEN_WINDOW << 8) |
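
Both cookie layouts above leave ptr 32-bit aligned. Worked example: a 10-byte cookie (an odd multiple of 16 bits) packs kind, length and the first two cookie bytes into one word, then memcpy()s the remaining 8, for 12 bytes total; an 8-byte cookie instead spends two NOPs before kind/length and copies all 8, also 12 bytes. Either way the option consumes 2 + cookie_size rounded up to a multiple of 4.
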
@@ -463,13 +560,17 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
463 struct tcp_out_options *opts, 560 struct tcp_out_options *opts,
464 struct tcp_md5sig_key **md5) { 561 struct tcp_md5sig_key **md5) {
465 struct tcp_sock *tp = tcp_sk(sk); 562 struct tcp_sock *tp = tcp_sk(sk);
466 unsigned size = 0; 563 struct tcp_cookie_values *cvp = tp->cookie_values;
564 unsigned remaining = MAX_TCP_OPTION_SPACE;
565 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
566 tcp_cookie_size_check(cvp->cookie_desired) :
567 0;
467 568
468#ifdef CONFIG_TCP_MD5SIG 569#ifdef CONFIG_TCP_MD5SIG
469 *md5 = tp->af_specific->md5_lookup(sk, sk); 570 *md5 = tp->af_specific->md5_lookup(sk, sk);
470 if (*md5) { 571 if (*md5) {
471 opts->options |= OPTION_MD5; 572 opts->options |= OPTION_MD5;
472 size += TCPOLEN_MD5SIG_ALIGNED; 573 remaining -= TCPOLEN_MD5SIG_ALIGNED;
473 } 574 }
474#else 575#else
475 *md5 = NULL; 576 *md5 = NULL;
@@ -485,26 +586,72 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
485 * SACKs don't matter, we never delay an ACK when we have any of those 586 * SACKs don't matter, we never delay an ACK when we have any of those
486 * going out. */ 587 * going out. */
487 opts->mss = tcp_advertise_mss(sk); 588 opts->mss = tcp_advertise_mss(sk);
488 size += TCPOLEN_MSS_ALIGNED; 589 remaining -= TCPOLEN_MSS_ALIGNED;
489 590
490 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 591 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
491 opts->options |= OPTION_TS; 592 opts->options |= OPTION_TS;
492 opts->tsval = TCP_SKB_CB(skb)->when; 593 opts->tsval = TCP_SKB_CB(skb)->when;
493 opts->tsecr = tp->rx_opt.ts_recent; 594 opts->tsecr = tp->rx_opt.ts_recent;
494 size += TCPOLEN_TSTAMP_ALIGNED; 595 remaining -= TCPOLEN_TSTAMP_ALIGNED;
495 } 596 }
496 if (likely(sysctl_tcp_window_scaling)) { 597 if (likely(sysctl_tcp_window_scaling)) {
497 opts->ws = tp->rx_opt.rcv_wscale; 598 opts->ws = tp->rx_opt.rcv_wscale;
498 opts->options |= OPTION_WSCALE; 599 opts->options |= OPTION_WSCALE;
499 size += TCPOLEN_WSCALE_ALIGNED; 600 remaining -= TCPOLEN_WSCALE_ALIGNED;
500 } 601 }
501 if (likely(sysctl_tcp_sack)) { 602 if (likely(sysctl_tcp_sack)) {
502 opts->options |= OPTION_SACK_ADVERTISE; 603 opts->options |= OPTION_SACK_ADVERTISE;
503 if (unlikely(!(OPTION_TS & opts->options))) 604 if (unlikely(!(OPTION_TS & opts->options)))
504 size += TCPOLEN_SACKPERM_ALIGNED; 605 remaining -= TCPOLEN_SACKPERM_ALIGNED;
505 } 606 }
506 607
507 return size; 608 /* Note that timestamps are required by the specification.
609 *
610 * Odd numbers of bytes are prohibited by the specification, ensuring
611 * that the cookie is 16-bit aligned, and the resulting cookie pair is
612 * 32-bit aligned.
613 */
614 if (*md5 == NULL &&
615 (OPTION_TS & opts->options) &&
616 cookie_size > 0) {
617 int need = TCPOLEN_COOKIE_BASE + cookie_size;
618
619 if (0x2 & need) {
620 /* 32-bit multiple */
621 need += 2; /* NOPs */
622
623 if (need > remaining) {
624 /* try shrinking cookie to fit */
625 cookie_size -= 2;
626 need -= 4;
627 }
628 }
629 while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
630 cookie_size -= 4;
631 need -= 4;
632 }
633 if (TCP_COOKIE_MIN <= cookie_size) {
634 opts->options |= OPTION_COOKIE_EXTENSION;
635 opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
636 opts->hash_size = cookie_size;
637
638 /* Remember for future incarnations. */
639 cvp->cookie_desired = cookie_size;
640
641 if (cvp->cookie_desired != cvp->cookie_pair_size) {
642 /* Currently use random bytes as a nonce,
643 * assuming these are completely unpredictable
644 * by hostile users of the same system.
645 */
646 get_random_bytes(&cvp->cookie_pair[0],
647 cookie_size);
648 cvp->cookie_pair_size = cookie_size;
649 }
650
651 remaining -= need;
652 }
653 }
654 return MAX_TCP_OPTION_SPACE - remaining;
508} 655}
509 656
510/* Set up TCP options for SYN-ACKs. */ 657/* Set up TCP options for SYN-ACKs. */
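
tcp_syn_options() now budgets downward from MAX_TCP_OPTION_SPACE (40 bytes) instead of summing upward, which makes the fit test for the cookie trivial. Worked case without MD5: MSS (4) + timestamps (12) + window scale (4) leave 20 bytes, SACK-permitted folding into the timestamp words for free; a desired 16-byte cookie needs 2 + 16 = 18, padded by two NOPs to 20, exactly exhausting the budget. Had it not fit, the shrink loop trims the cookie toward TCP_COOKIE_MIN before giving up on the option altogether.
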
@@ -512,48 +659,77 @@ static unsigned tcp_synack_options(struct sock *sk,
512 struct request_sock *req, 659 struct request_sock *req,
513 unsigned mss, struct sk_buff *skb, 660 unsigned mss, struct sk_buff *skb,
514 struct tcp_out_options *opts, 661 struct tcp_out_options *opts,
515 struct tcp_md5sig_key **md5) { 662 struct tcp_md5sig_key **md5,
516 unsigned size = 0; 663 struct tcp_extend_values *xvp)
664{
517 struct inet_request_sock *ireq = inet_rsk(req); 665 struct inet_request_sock *ireq = inet_rsk(req);
518 char doing_ts; 666 unsigned remaining = MAX_TCP_OPTION_SPACE;
667 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
668 xvp->cookie_plus :
669 0;
670 bool doing_ts = ireq->tstamp_ok;
519 671
520#ifdef CONFIG_TCP_MD5SIG 672#ifdef CONFIG_TCP_MD5SIG
521 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 673 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
522 if (*md5) { 674 if (*md5) {
523 opts->options |= OPTION_MD5; 675 opts->options |= OPTION_MD5;
524 size += TCPOLEN_MD5SIG_ALIGNED; 676 remaining -= TCPOLEN_MD5SIG_ALIGNED;
677
678 /* We can't fit any SACK blocks in a packet with MD5 + TS
679 * options. There was discussion about disabling SACK
680 * rather than TS in order to fit in better with old,
681 * buggy kernels, but that was deemed to be unnecessary.
682 */
683 doing_ts &= !ireq->sack_ok;
525 } 684 }
526#else 685#else
527 *md5 = NULL; 686 *md5 = NULL;
528#endif 687#endif
529 688
530 /* we can't fit any SACK blocks in a packet with MD5 + TS 689 /* We always send an MSS option. */
531 options. There was discussion about disabling SACK rather than TS in
532 order to fit in better with old, buggy kernels, but that was deemed
533 to be unnecessary. */
534 doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
535
536 opts->mss = mss; 690 opts->mss = mss;
537 size += TCPOLEN_MSS_ALIGNED; 691 remaining -= TCPOLEN_MSS_ALIGNED;
538 692
539 if (likely(ireq->wscale_ok)) { 693 if (likely(ireq->wscale_ok)) {
540 opts->ws = ireq->rcv_wscale; 694 opts->ws = ireq->rcv_wscale;
541 opts->options |= OPTION_WSCALE; 695 opts->options |= OPTION_WSCALE;
542 size += TCPOLEN_WSCALE_ALIGNED; 696 remaining -= TCPOLEN_WSCALE_ALIGNED;
543 } 697 }
544 if (likely(doing_ts)) { 698 if (likely(doing_ts)) {
545 opts->options |= OPTION_TS; 699 opts->options |= OPTION_TS;
546 opts->tsval = TCP_SKB_CB(skb)->when; 700 opts->tsval = TCP_SKB_CB(skb)->when;
547 opts->tsecr = req->ts_recent; 701 opts->tsecr = req->ts_recent;
548 size += TCPOLEN_TSTAMP_ALIGNED; 702 remaining -= TCPOLEN_TSTAMP_ALIGNED;
549 } 703 }
550 if (likely(ireq->sack_ok)) { 704 if (likely(ireq->sack_ok)) {
551 opts->options |= OPTION_SACK_ADVERTISE; 705 opts->options |= OPTION_SACK_ADVERTISE;
552 if (unlikely(!doing_ts)) 706 if (unlikely(!doing_ts))
553 size += TCPOLEN_SACKPERM_ALIGNED; 707 remaining -= TCPOLEN_SACKPERM_ALIGNED;
554 } 708 }
555 709
556 return size; 710 /* Similar rationale to tcp_syn_options() applies here, too.
711 * If the <SYN> options fit, the same options should fit now!
712 */
713 if (*md5 == NULL &&
714 doing_ts &&
715 cookie_plus > TCPOLEN_COOKIE_BASE) {
716 int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
717
718 if (0x2 & need) {
719 /* 32-bit multiple */
720 need += 2; /* NOPs */
721 }
722 if (need <= remaining) {
723 opts->options |= OPTION_COOKIE_EXTENSION;
724 opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
725 remaining -= need;
726 } else {
727 /* There's no error return, so flag it. */
728 xvp->cookie_out_never = 1; /* true */
729 opts->hash_size = 0;
730 }
731 }
732 return MAX_TCP_OPTION_SPACE - remaining;
557} 733}
558 734
559/* Compute TCP options for ESTABLISHED sockets. This is not the 735/* Compute TCP options for ESTABLISHED sockets. This is not the
@@ -619,7 +795,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
619 struct tcp_out_options opts; 795 struct tcp_out_options opts;
620 unsigned tcp_options_size, tcp_header_size; 796 unsigned tcp_options_size, tcp_header_size;
621 struct tcp_md5sig_key *md5; 797 struct tcp_md5sig_key *md5;
622 __u8 *md5_hash_location;
623 struct tcphdr *th; 798 struct tcphdr *th;
624 int err; 799 int err;
625 800
@@ -661,8 +836,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
661 836
662 /* Build TCP header and checksum it. */ 837 /* Build TCP header and checksum it. */
663 th = tcp_hdr(skb); 838 th = tcp_hdr(skb);
664 th->source = inet->sport; 839 th->source = inet->inet_sport;
665 th->dest = inet->dport; 840 th->dest = inet->inet_dport;
666 th->seq = htonl(tcb->seq); 841 th->seq = htonl(tcb->seq);
667 th->ack_seq = htonl(tp->rcv_nxt); 842 th->ack_seq = htonl(tp->rcv_nxt);
668 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 843 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
@@ -690,7 +865,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
690 } 865 }
691 } 866 }
692 867
693 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 868 tcp_options_write((__be32 *)(th + 1), tp, &opts);
694 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) 869 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
695 TCP_ECN_send(sk, skb, tcp_header_size); 870 TCP_ECN_send(sk, skb, tcp_header_size);
696 871
@@ -698,7 +873,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
698 /* Calculate the MD5 hash, as we have all we need now */ 873 /* Calculate the MD5 hash, as we have all we need now */
699 if (md5) { 874 if (md5) {
700 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 875 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
701 tp->af_specific->calc_md5_hash(md5_hash_location, 876 tp->af_specific->calc_md5_hash(opts.hash_location,
702 md5, sk, NULL, skb); 877 md5, sk, NULL, skb);
703 } 878 }
704#endif 879#endif
@@ -1627,11 +1802,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1627void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, 1802void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1628 int nonagle) 1803 int nonagle)
1629{ 1804{
1630 struct sk_buff *skb = tcp_send_head(sk);
1631
1632 if (!skb)
1633 return;
1634
1635 /* If we are closed, the bytes will have to remain here. 1805 /* If we are closed, the bytes will have to remain here.
1636 * In time closedown will finish, we empty the write queue and 1806 * In time closedown will finish, we empty the write queue and
1637 * all will be happy. 1807 * all will be happy.
@@ -1918,8 +2088,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1918 * case, when window is shrunk to zero. In this case 2088 * case, when window is shrunk to zero. In this case
1919 * our retransmit serves as a zero window probe. 2089 * our retransmit serves as a zero window probe.
1920 */ 2090 */
1921 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) 2091 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
1922 && TCP_SKB_CB(skb)->seq != tp->snd_una) 2092 TCP_SKB_CB(skb)->seq != tp->snd_una)
1923 return -EAGAIN; 2093 return -EAGAIN;
1924 2094
1925 if (skb->len > cur_mss) { 2095 if (skb->len > cur_mss) {
@@ -2219,19 +2389,24 @@ int tcp_send_synack(struct sock *sk)
2219 2389
2220/* Prepare a SYN-ACK. */ 2390/* Prepare a SYN-ACK. */
2221struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2391struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2222 struct request_sock *req) 2392 struct request_sock *req,
2393 struct request_values *rvp)
2223{ 2394{
2395 struct tcp_out_options opts;
2396 struct tcp_extend_values *xvp = tcp_xv(rvp);
2224 struct inet_request_sock *ireq = inet_rsk(req); 2397 struct inet_request_sock *ireq = inet_rsk(req);
2225 struct tcp_sock *tp = tcp_sk(sk); 2398 struct tcp_sock *tp = tcp_sk(sk);
2399 const struct tcp_cookie_values *cvp = tp->cookie_values;
2226 struct tcphdr *th; 2400 struct tcphdr *th;
2227 int tcp_header_size;
2228 struct tcp_out_options opts;
2229 struct sk_buff *skb; 2401 struct sk_buff *skb;
2230 struct tcp_md5sig_key *md5; 2402 struct tcp_md5sig_key *md5;
2231 __u8 *md5_hash_location; 2403 int tcp_header_size;
2232 int mss; 2404 int mss;
2405 int s_data_desired = 0;
2233 2406
2234 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2407 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2408 s_data_desired = cvp->s_data_desired;
2409 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2235 if (skb == NULL) 2410 if (skb == NULL)
2236 return NULL; 2411 return NULL;
2237 2412
@@ -2254,7 +2429,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2254 &req->rcv_wnd, 2429 &req->rcv_wnd,
2255 &req->window_clamp, 2430 &req->window_clamp,
2256 ireq->wscale_ok, 2431 ireq->wscale_ok,
2257 &rcv_wscale); 2432 &rcv_wscale,
2433 dst_metric(dst, RTAX_INITRWND));
2258 ireq->rcv_wscale = rcv_wscale; 2434 ireq->rcv_wscale = rcv_wscale;
2259 } 2435 }
2260 2436
@@ -2266,8 +2442,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2266#endif 2442#endif
2267 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2443 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2268 tcp_header_size = tcp_synack_options(sk, req, mss, 2444 tcp_header_size = tcp_synack_options(sk, req, mss,
2269 skb, &opts, &md5) + 2445 skb, &opts, &md5, xvp)
2270 sizeof(struct tcphdr); 2446 + sizeof(*th);
2271 2447
2272 skb_push(skb, tcp_header_size); 2448 skb_push(skb, tcp_header_size);
2273 skb_reset_transport_header(skb); 2449 skb_reset_transport_header(skb);
@@ -2284,19 +2460,54 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2284 */ 2460 */
2285 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2461 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2286 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); 2462 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
2463
2464 if (OPTION_COOKIE_EXTENSION & opts.options) {
2465 if (s_data_desired) {
2466 u8 *buf = skb_put(skb, s_data_desired);
2467
2468 /* copy data directly from the listening socket. */
2469 memcpy(buf, cvp->s_data_payload, s_data_desired);
2470 TCP_SKB_CB(skb)->end_seq += s_data_desired;
2471 }
2472
2473 if (opts.hash_size > 0) {
2474 __u32 workspace[SHA_WORKSPACE_WORDS];
2475 u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
2476 u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
2477
2478 /* Secret recipe depends on the Timestamp, (future)
2479 * Sequence and Acknowledgment Numbers, Initiator
 2480 * Cookie, and others handled by the IP-variant caller.
2481 */
2482 *tail-- ^= opts.tsval;
2483 *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
2484 *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
2485
2486 /* recommended */
2487 *tail-- ^= ((th->dest << 16) | th->source);
2488 *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
2489
2490 sha_transform((__u32 *)&xvp->cookie_bakery[0],
2491 (char *)mess,
2492 &workspace[0]);
2493 opts.hash_location =
2494 (__u8 *)&xvp->cookie_bakery[0];
2495 }
2496 }
2497
2287 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2498 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2288 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2499 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2289 2500
2290 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2501 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2291 th->window = htons(min(req->rcv_wnd, 65535U)); 2502 th->window = htons(min(req->rcv_wnd, 65535U));
2292 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 2503 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2293 th->doff = (tcp_header_size >> 2); 2504 th->doff = (tcp_header_size >> 2);
2294 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); 2505 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
2295 2506
2296#ifdef CONFIG_TCP_MD5SIG 2507#ifdef CONFIG_TCP_MD5SIG
2297 /* Okay, we have all we need - do the md5 hash if needed */ 2508 /* Okay, we have all we need - do the md5 hash if needed */
2298 if (md5) { 2509 if (md5) {
2299 tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, 2510 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2300 md5, NULL, req, skb); 2511 md5, NULL, req, skb);
2301 } 2512 }
2302#endif 2513#endif
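The cookie-extension hunk above folds per-connection fields into the tail of a 16-word message block by XOR and then runs one SHA-1 compression (sha_transform) over it, leaving the digest in cookie_bakery[] for tcp_options_write() to emit. A userspace sketch of that folding pattern follows; mix() is a toy, non-cryptographic stand-in for the kernel's sha_transform(), the field values are made up, and COOKIE_DIGEST_WORDS/COOKIE_MESSAGE_WORDS are assumed to be 5 and 16 as in the kernel headers.

#include <stdint.h>
#include <stdio.h>

#define COOKIE_DIGEST_WORDS  5   /* assumed: SHA-1 digest in 32-bit words */
#define COOKIE_MESSAGE_WORDS 16  /* assumed: SHA-1 block in 32-bit words  */

/* Toy stand-in for the kernel's sha_transform(); NOT cryptographic. */
static void mix(uint32_t digest[], const uint32_t mess[])
{
        for (int i = 0; i < COOKIE_MESSAGE_WORDS; i++)
                digest[i % COOKIE_DIGEST_WORDS] ^= mess[i] * 2654435761u;
}

int main(void)
{
        uint32_t bakery[COOKIE_DIGEST_WORDS + COOKIE_MESSAGE_WORDS] = { 0 };
        uint32_t *mess = &bakery[COOKIE_DIGEST_WORDS];
        uint32_t *tail = &mess[COOKIE_MESSAGE_WORDS - 1];

        /* Fold made-up connection-specific inputs into the tail. */
        *tail-- ^= 0x12345678;            /* e.g. timestamp value       */
        *tail-- ^= 0x0000abcdu + 1;       /* e.g. rcv_isn + 1           */
        *tail-- ^= 0x00005678u + 1;       /* e.g. snt_isn + 1           */
        *tail-- ^= (80u << 16) | 54321u;  /* e.g. (dest << 16) | source */

        mix(&bakery[0], mess);            /* digest usable as cookie */
        printf("%08x\n", bakery[0]);
        return 0;
}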
@@ -2342,7 +2553,8 @@ static void tcp_connect_init(struct sock *sk)
2342 &tp->rcv_wnd, 2553 &tp->rcv_wnd,
2343 &tp->window_clamp, 2554 &tp->window_clamp,
2344 sysctl_tcp_window_scaling, 2555 sysctl_tcp_window_scaling,
2345 &rcv_wscale); 2556 &rcv_wscale,
2557 dst_metric(dst, RTAX_INITRWND));
2346 2558
2347 tp->rx_opt.rcv_wscale = rcv_wscale; 2559 tp->rx_opt.rcv_wscale = rcv_wscale;
2348 tp->rcv_ssthresh = tp->rcv_wnd; 2560 tp->rcv_ssthresh = tp->rcv_wnd;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 59f5b5e7c566..f8efada580e8 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -22,6 +22,7 @@
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/socket.h> 23#include <linux/socket.h>
24#include <linux/tcp.h> 24#include <linux/tcp.h>
25#include <linux/slab.h>
25#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/ktime.h> 28#include <linux/ktime.h>
@@ -39,9 +40,9 @@ static int port __read_mostly = 0;
39MODULE_PARM_DESC(port, "Port to match (0=all)"); 40MODULE_PARM_DESC(port, "Port to match (0=all)");
40module_param(port, int, 0); 41module_param(port, int, 0);
41 42
42static int bufsize __read_mostly = 4096; 43static unsigned int bufsize __read_mostly = 4096;
43MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); 44MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
44module_param(bufsize, int, 0); 45module_param(bufsize, uint, 0);
45 46
46static int full __read_mostly; 47static int full __read_mostly;
47MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); 48MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
@@ -75,12 +76,12 @@ static struct {
75 76
76static inline int tcp_probe_used(void) 77static inline int tcp_probe_used(void)
77{ 78{
78 return (tcp_probe.head - tcp_probe.tail) % bufsize; 79 return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
79} 80}
80 81
81static inline int tcp_probe_avail(void) 82static inline int tcp_probe_avail(void)
82{ 83{
83 return bufsize - tcp_probe_used(); 84 return bufsize - tcp_probe_used() - 1;
84} 85}
85 86
86/* 87/*
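These two helpers work because tcpprobe_init() now rounds bufsize up to a power of two (see below), so '& (bufsize - 1)' is equivalent to '% bufsize' without a division; the extra '- 1' in tcp_probe_avail() keeps one slot permanently empty, so head == tail can only mean "buffer empty". A minimal sketch of those invariants with a hypothetical 8-slot buffer:

#include <assert.h>

#define BUFSIZE 8  /* must be a power of two for the mask arithmetic */

static unsigned int head, tail;

static unsigned int used(void)  { return (head - tail) & (BUFSIZE - 1); }
static unsigned int avail(void) { return BUFSIZE - used() - 1; }

int main(void)
{
        /* fill all usable slots: one slot stays empty by design */
        while (avail() > 0)
                head = (head + 1) & (BUFSIZE - 1);
        assert(used() == BUFSIZE - 1);

        /* drain back to empty */
        while (used() > 0)
                tail = (tail + 1) & (BUFSIZE - 1);
        assert(avail() == BUFSIZE - 1);
        return 0;
}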
@@ -94,8 +95,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 const struct inet_sock *inet = inet_sk(sk); 95 const struct inet_sock *inet = inet_sk(sk);
95 96
96 /* Only update if port matches */ 97 /* Only update if port matches */
97 if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) 98 if ((port == 0 || ntohs(inet->inet_dport) == port ||
98 && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { 99 ntohs(inet->inet_sport) == port) &&
100 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
99 101
100 spin_lock(&tcp_probe.lock); 102 spin_lock(&tcp_probe.lock);
101 /* If log fills, just silently drop */ 103 /* If log fills, just silently drop */
@@ -103,10 +105,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
103 struct tcp_log *p = tcp_probe.log + tcp_probe.head; 105 struct tcp_log *p = tcp_probe.log + tcp_probe.head;
104 106
105 p->tstamp = ktime_get(); 107 p->tstamp = ktime_get();
106 p->saddr = inet->saddr; 108 p->saddr = inet->inet_saddr;
107 p->sport = inet->sport; 109 p->sport = inet->inet_sport;
108 p->daddr = inet->daddr; 110 p->daddr = inet->inet_daddr;
109 p->dport = inet->dport; 111 p->dport = inet->inet_dport;
110 p->length = skb->len; 112 p->length = skb->len;
111 p->snd_nxt = tp->snd_nxt; 113 p->snd_nxt = tp->snd_nxt;
112 p->snd_una = tp->snd_una; 114 p->snd_una = tp->snd_una;
@@ -115,7 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
115 p->ssthresh = tcp_current_ssthresh(sk); 117 p->ssthresh = tcp_current_ssthresh(sk);
116 p->srtt = tp->srtt >> 3; 118 p->srtt = tp->srtt >> 3;
117 119
118 tcp_probe.head = (tcp_probe.head + 1) % bufsize; 120 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
119 } 121 }
120 tcp_probe.lastcwnd = tp->snd_cwnd; 122 tcp_probe.lastcwnd = tp->snd_cwnd;
121 spin_unlock(&tcp_probe.lock); 123 spin_unlock(&tcp_probe.lock);
@@ -148,7 +150,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file)
148static int tcpprobe_sprint(char *tbuf, int n) 150static int tcpprobe_sprint(char *tbuf, int n)
149{ 151{
150 const struct tcp_log *p 152 const struct tcp_log *p
151 = tcp_probe.log + tcp_probe.tail % bufsize; 153 = tcp_probe.log + tcp_probe.tail;
152 struct timespec tv 154 struct timespec tv
153 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
154 156
@@ -191,7 +193,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
191 width = tcpprobe_sprint(tbuf, sizeof(tbuf)); 193 width = tcpprobe_sprint(tbuf, sizeof(tbuf));
192 194
193 if (cnt + width < len) 195 if (cnt + width < len)
194 tcp_probe.tail = (tcp_probe.tail + 1) % bufsize; 196 tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
195 197
196 spin_unlock_bh(&tcp_probe.lock); 198 spin_unlock_bh(&tcp_probe.lock);
197 199
@@ -221,9 +223,10 @@ static __init int tcpprobe_init(void)
221 init_waitqueue_head(&tcp_probe.wait); 223 init_waitqueue_head(&tcp_probe.wait);
222 spin_lock_init(&tcp_probe.lock); 224 spin_lock_init(&tcp_probe.lock);
223 225
224 if (bufsize < 0) 226 if (bufsize == 0)
225 return -EINVAL; 227 return -EINVAL;
226 228
229 bufsize = roundup_pow_of_two(bufsize);
227 tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); 230 tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
228 if (!tcp_probe.log) 231 if (!tcp_probe.log)
229 goto err0; 232 goto err0;
@@ -235,7 +238,7 @@ static __init int tcpprobe_init(void)
235 if (ret) 238 if (ret)
236 goto err1; 239 goto err1;
237 240
238 pr_info("TCP probe registered (port=%d)\n", port); 241 pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
239 return 0; 242 return 0;
240 err1: 243 err1:
241 proc_net_remove(&init_net, procname); 244 proc_net_remove(&init_net, procname);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cdb2ca7684d4..8a0ab2977f1f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/gfp.h>
22#include <net/tcp.h> 23#include <net/tcp.h>
23 24
24int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; 25int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
@@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
29int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; 30int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
30int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; 31int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
31int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly;
32 34
33static void tcp_write_timer(unsigned long); 35static void tcp_write_timer(unsigned long);
34static void tcp_delack_timer(unsigned long); 36static void tcp_delack_timer(unsigned long);
@@ -132,6 +134,35 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
132 } 134 }
133} 135}
134 136
137/* This function calculates a "timeout" which is equivalent to the timeout of a
138 * TCP connection after "boundary" unsuccessful, exponentially backed-off
139 * retransmissions with an initial RTO of TCP_RTO_MIN.
140 */
141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary)
143{
144 unsigned int timeout, linear_backoff_thresh;
145 unsigned int start_ts;
146
147 if (!inet_csk(sk)->icsk_retransmits)
148 return false;
149
150 if (unlikely(!tcp_sk(sk)->retrans_stamp))
151 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
152 else
153 start_ts = tcp_sk(sk)->retrans_stamp;
154
155 linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
156
157 if (boundary <= linear_backoff_thresh)
158 timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
159 else
160 timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
161 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
162
163 return (tcp_time_stamp - start_ts) >= timeout;
164}
165
135/* A write timeout has occurred. Process the after effects. */ 166/* A write timeout has occurred. Process the after effects. */
136static int tcp_write_timeout(struct sock *sk) 167static int tcp_write_timeout(struct sock *sk)
137{ 168{
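To put numbers on the formula: with the conventional TCP_RTO_MIN of 200 ms and TCP_RTO_MAX of 120 s, linear_backoff_thresh = ilog2(600) = 9, so a boundary of 3 (the tcp_retries1 default) gives (2<<3 - 1) * 200 ms = 3 s, and a boundary of 15 (the tcp_retries2 default) gives (2<<9 - 1) * 200 ms + 6 * 120 s, roughly 924.6 s. A standalone sketch of the same arithmetic, in milliseconds rather than jiffies:

#include <stdio.h>

#define RTO_MIN_MS 200     /* conventional TCP_RTO_MIN (HZ/5)    */
#define RTO_MAX_MS 120000  /* conventional TCP_RTO_MAX (120*HZ)  */

static unsigned int ilog2_u(unsigned int v)
{
        unsigned int r = 0;
        while (v >>= 1)
                r++;
        return r;
}

static unsigned int timeout_ms(unsigned int boundary)
{
        unsigned int thresh = ilog2_u(RTO_MAX_MS / RTO_MIN_MS);

        if (boundary <= thresh)
                return ((2u << boundary) - 1) * RTO_MIN_MS;
        return ((2u << thresh) - 1) * RTO_MIN_MS +
               (boundary - thresh) * RTO_MAX_MS;
}

int main(void)
{
        printf("boundary 3:  %u ms\n", timeout_ms(3));   /* 3000 ms    */
        printf("boundary 15: %u ms\n", timeout_ms(15));  /* 924600 ms  */
        return 0;
}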
@@ -141,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk)
141 172
142 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 173 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
143 if (icsk->icsk_retransmits) 174 if (icsk->icsk_retransmits)
144 dst_negative_advice(&sk->sk_dst_cache); 175 dst_negative_advice(&sk->sk_dst_cache, sk);
145 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 176 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
146 } else { 177 } else {
147 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 178 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
148 /* Black hole detection */ 179 /* Black hole detection */
149 tcp_mtu_probing(icsk, sk); 180 tcp_mtu_probing(icsk, sk);
150 181
151 dst_negative_advice(&sk->sk_dst_cache); 182 dst_negative_advice(&sk->sk_dst_cache, sk);
152 } 183 }
153 184
154 retry_until = sysctl_tcp_retries2; 185 retry_until = sysctl_tcp_retries2;
@@ -303,15 +334,15 @@ void tcp_retransmit_timer(struct sock *sk)
303 struct inet_sock *inet = inet_sk(sk); 334 struct inet_sock *inet = inet_sk(sk);
304 if (sk->sk_family == AF_INET) { 335 if (sk->sk_family == AF_INET) {
305 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 336 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
306 &inet->daddr, ntohs(inet->dport), 337 &inet->inet_daddr, ntohs(inet->inet_dport),
307 inet->num, tp->snd_una, tp->snd_nxt); 338 inet->inet_num, tp->snd_una, tp->snd_nxt);
308 } 339 }
309#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 340#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
310 else if (sk->sk_family == AF_INET6) { 341 else if (sk->sk_family == AF_INET6) {
311 struct ipv6_pinfo *np = inet6_sk(sk); 342 struct ipv6_pinfo *np = inet6_sk(sk);
312 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 343 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
313 &np->daddr, ntohs(inet->dport), 344 &np->daddr, ntohs(inet->inet_dport),
314 inet->num, tp->snd_una, tp->snd_nxt); 345 inet->inet_num, tp->snd_una, tp->snd_nxt);
315 } 346 }
316#endif 347#endif
317#endif 348#endif
@@ -386,7 +417,25 @@ void tcp_retransmit_timer(struct sock *sk)
386 icsk->icsk_retransmits++; 417 icsk->icsk_retransmits++;
387 418
388out_reset_timer: 419out_reset_timer:
 389 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 420 /* If the stream is thin, use linear timeouts. Since 'icsk_backoff' is
 421 * used to reset the timer, set it to 0. Recalculate 'icsk_rto' as it
 422 * might have been increased while the stream oscillated between thin
 423 * and thick: the old value may already be too high compared to the
 424 * value set by 'tcp_set_rto' in tcp_input.c, which resets the rto
 425 * without backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
 426 * exponential backoff behaviour, to avoid continually hammering
 427 * linear-timeout retransmissions into a black hole.
 428 */
429 if (sk->sk_state == TCP_ESTABLISHED &&
430 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
431 tcp_stream_is_thin(tp) &&
432 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
433 icsk->icsk_backoff = 0;
434 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
435 } else {
436 /* Use normal (exponential) backoff */
437 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
438 }
390 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 439 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
391 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 440 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
392 __sk_dst_reset(sk); 441 __sk_dst_reset(sk);
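A condensed model of that RTO decision may help; next_rto() and fresh_rto are illustrative names, 'thin' stands in for tcp_stream_is_thin() (roughly, fewer than four segments in flight), and TCP_THIN_LINEAR_RETRIES is assumed to be 6 as in this series:

#include <stdio.h>

/* Sketch: pick the next retransmission timeout, in ms. */
static unsigned int next_rto(unsigned int cur_rto, unsigned int fresh_rto,
                             int established, int thin, unsigned int retrans)
{
        const unsigned int rto_max = 120000;

        if (established && thin && retrans <= 6)
                return fresh_rto < rto_max ? fresh_rto : rto_max; /* linear */
        cur_rto <<= 1;                                     /* exponential */
        return cur_rto < rto_max ? cur_rto : rto_max;
}

int main(void)
{
        /* thin stream: timer re-arms at the freshly computed RTO */
        printf("thin:  %u ms\n", next_rto(1600, 400, 1, 1, 3));
        /* thick stream (or retries exhausted): classic doubling */
        printf("thick: %u ms\n", next_rto(1600, 400, 1, 0, 3));
        return 0;
}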
@@ -445,6 +494,12 @@ static void tcp_synack_timer(struct sock *sk)
445 TCP_TIMEOUT_INIT, TCP_RTO_MAX); 494 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
446} 495}
447 496
497void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
498{
499 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
500}
501EXPORT_SYMBOL(tcp_syn_ack_timeout);
502
448void tcp_set_keepalive(struct sock *sk, int val) 503void tcp_set_keepalive(struct sock *sk, int val)
449{ 504{
450 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) 505 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index e9bbff746488..b612acf76183 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
165 * every other rtt. 165 * every other rtt.
166 */ 166 */
167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
168 if (veno->inc 168 if (veno->inc &&
169 && tp->snd_cwnd < 169 tp->snd_cwnd < tp->snd_cwnd_clamp) {
170 tp->snd_cwnd_clamp) {
171 tp->snd_cwnd++; 170 tp->snd_cwnd++;
172 veno->inc = 0; 171 veno->inc = 0;
173 } else 172 } else
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 66b6821b984e..a0f240358892 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
157 157
158 if (queue > TCP_YEAH_ALPHA || 158 if (queue > TCP_YEAH_ALPHA ||
159 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { 159 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
160 if (queue > TCP_YEAH_ALPHA 160 if (queue > TCP_YEAH_ALPHA &&
161 && tp->snd_cwnd > yeah->reno_count) { 161 tp->snd_cwnd > yeah->reno_count) {
162 u32 reduction = min(queue / TCP_YEAH_GAMMA , 162 u32 reduction = min(queue / TCP_YEAH_GAMMA ,
163 tp->snd_cwnd >> TCP_YEAH_EPSILON); 163 tp->snd_cwnd >> TCP_YEAH_EPSILON);
164 164
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3959e0ca456a..3b3813cc80b9 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -8,6 +8,7 @@
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/skbuff.h> 10#include <linux/skbuff.h>
11#include <linux/slab.h>
11#include <net/icmp.h> 12#include <net/icmp.h>
12#include <net/ip.h> 13#include <net/ip.h>
13#include <net/protocol.h> 14#include <net/protocol.h>
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fa9f70e4b19..c36522a0f113 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
95#include <linux/mm.h> 95#include <linux/mm.h>
96#include <linux/inet.h> 96#include <linux/inet.h>
97#include <linux/netdevice.h> 97#include <linux/netdevice.h>
98#include <linux/slab.h>
98#include <net/tcp_states.h> 99#include <net/tcp_states.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/proc_fs.h> 101#include <linux/proc_fs.h>
@@ -106,7 +107,7 @@
106#include <net/xfrm.h> 107#include <net/xfrm.h>
107#include "udp_impl.h" 108#include "udp_impl.h"
108 109
109struct udp_table udp_table; 110struct udp_table udp_table __read_mostly;
110EXPORT_SYMBOL(udp_table); 111EXPORT_SYMBOL(udp_table);
111 112
112int sysctl_udp_mem[3] __read_mostly; 113int sysctl_udp_mem[3] __read_mostly;
@@ -121,28 +122,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
121atomic_t udp_memory_allocated; 122atomic_t udp_memory_allocated;
122EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
123 124
124#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) 125#define MAX_UDP_PORTS 65536
126#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
125 127
126static int udp_lib_lport_inuse(struct net *net, __u16 num, 128static int udp_lib_lport_inuse(struct net *net, __u16 num,
127 const struct udp_hslot *hslot, 129 const struct udp_hslot *hslot,
128 unsigned long *bitmap, 130 unsigned long *bitmap,
129 struct sock *sk, 131 struct sock *sk,
130 int (*saddr_comp)(const struct sock *sk1, 132 int (*saddr_comp)(const struct sock *sk1,
131 const struct sock *sk2)) 133 const struct sock *sk2),
134 unsigned int log)
132{ 135{
133 struct sock *sk2; 136 struct sock *sk2;
134 struct hlist_nulls_node *node; 137 struct hlist_nulls_node *node;
135 138
136 sk_nulls_for_each(sk2, node, &hslot->head) 139 sk_nulls_for_each(sk2, node, &hslot->head)
137 if (net_eq(sock_net(sk2), net) && 140 if (net_eq(sock_net(sk2), net) &&
138 sk2 != sk && 141 sk2 != sk &&
139 (bitmap || sk2->sk_hash == num) && 142 (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
140 (!sk2->sk_reuse || !sk->sk_reuse) && 143 (!sk2->sk_reuse || !sk->sk_reuse) &&
141 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if 144 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
142 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 145 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
143 (*saddr_comp)(sk, sk2)) { 146 (*saddr_comp)(sk, sk2)) {
144 if (bitmap) 147 if (bitmap)
145 __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, 148 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
146 bitmap); 149 bitmap);
147 else 150 else
148 return 1; 151 return 1;
@@ -150,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
150 return 0; 153 return 0;
151} 154}
152 155
156/*
157 * Note: we still hold spinlock of primary hash chain, so no other writer
158 * can insert/delete a socket with local_port == num
159 */
160static int udp_lib_lport_inuse2(struct net *net, __u16 num,
161 struct udp_hslot *hslot2,
162 struct sock *sk,
163 int (*saddr_comp)(const struct sock *sk1,
164 const struct sock *sk2))
165{
166 struct sock *sk2;
167 struct hlist_nulls_node *node;
168 int res = 0;
169
170 spin_lock(&hslot2->lock);
171 udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
172 if (net_eq(sock_net(sk2), net) &&
173 sk2 != sk &&
174 (udp_sk(sk2)->udp_port_hash == num) &&
175 (!sk2->sk_reuse || !sk->sk_reuse) &&
176 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
177 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
178 (*saddr_comp)(sk, sk2)) {
179 res = 1;
180 break;
181 }
182 spin_unlock(&hslot2->lock);
183 return res;
184}
185
153/** 186/**
154 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 187 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
155 * 188 *
156 * @sk: socket struct in question 189 * @sk: socket struct in question
157 * @snum: port number to look up 190 * @snum: port number to look up
158 * @saddr_comp: AF-dependent comparison of bound local IP addresses 191 * @saddr_comp: AF-dependent comparison of bound local IP addresses
 192 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
193 * with NULL address
159 */ 194 */
160int udp_lib_get_port(struct sock *sk, unsigned short snum, 195int udp_lib_get_port(struct sock *sk, unsigned short snum,
161 int (*saddr_comp)(const struct sock *sk1, 196 int (*saddr_comp)(const struct sock *sk1,
162 const struct sock *sk2)) 197 const struct sock *sk2),
198 unsigned int hash2_nulladdr)
163{ 199{
164 struct udp_hslot *hslot; 200 struct udp_hslot *hslot, *hslot2;
165 struct udp_table *udptable = sk->sk_prot->h.udp_table; 201 struct udp_table *udptable = sk->sk_prot->h.udp_table;
166 int error = 1; 202 int error = 1;
167 struct net *net = sock_net(sk); 203 struct net *net = sock_net(sk);
@@ -180,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
180 /* 216 /*
181 * force rand to be an odd multiple of UDP_HTABLE_SIZE 217 * force rand to be an odd multiple of UDP_HTABLE_SIZE
182 */ 218 */
183 rand = (rand | 1) * UDP_HTABLE_SIZE; 219 rand = (rand | 1) * (udptable->mask + 1);
184 for (last = first + UDP_HTABLE_SIZE; first != last; first++) { 220 last = first + udptable->mask + 1;
185 hslot = &udptable->hash[udp_hashfn(net, first)]; 221 do {
222 hslot = udp_hashslot(udptable, net, first);
186 bitmap_zero(bitmap, PORTS_PER_CHAIN); 223 bitmap_zero(bitmap, PORTS_PER_CHAIN);
187 spin_lock_bh(&hslot->lock); 224 spin_lock_bh(&hslot->lock);
188 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, 225 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
189 saddr_comp); 226 saddr_comp, udptable->log);
190 227
191 snum = first; 228 snum = first;
192 /* 229 /*
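The stepping trick above deserves a note: rand is forced to be an odd multiple of the table size, so repeatedly adding it to a 16-bit port number walks through every one of the 65536/(mask+1) ports that fall in the same hash chain exactly once before returning to the start. A quick demonstration, assuming a hypothetical 128-slot table:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const unsigned int hsize = 128;           /* udptable->mask + 1   */
        const unsigned int chain = 65536 / hsize; /* ports per hash chain */
        unsigned int step = (12345u | 1) * hsize; /* odd multiple of hsize */
        uint16_t first = 40000, snum = first;
        unsigned int visited = 0;

        do {
                visited++;      /* every port visited here hashes alike */
                snum += step;   /* uint16_t wraps mod 65536 */
        } while (snum != first);

        printf("visited %u of %u ports in the chain\n", visited, chain);
        return 0;
}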
@@ -196,25 +233,59 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
196 */ 233 */
197 do { 234 do {
198 if (low <= snum && snum <= high && 235 if (low <= snum && snum <= high &&
199 !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) 236 !test_bit(snum >> udptable->log, bitmap))
200 goto found; 237 goto found;
201 snum += rand; 238 snum += rand;
202 } while (snum != first); 239 } while (snum != first);
203 spin_unlock_bh(&hslot->lock); 240 spin_unlock_bh(&hslot->lock);
204 } 241 } while (++first != last);
205 goto fail; 242 goto fail;
206 } else { 243 } else {
207 hslot = &udptable->hash[udp_hashfn(net, snum)]; 244 hslot = udp_hashslot(udptable, net, snum);
208 spin_lock_bh(&hslot->lock); 245 spin_lock_bh(&hslot->lock);
209 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) 246 if (hslot->count > 10) {
247 int exist;
248 unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
249
250 slot2 &= udptable->mask;
251 hash2_nulladdr &= udptable->mask;
252
253 hslot2 = udp_hashslot2(udptable, slot2);
254 if (hslot->count < hslot2->count)
255 goto scan_primary_hash;
256
257 exist = udp_lib_lport_inuse2(net, snum, hslot2,
258 sk, saddr_comp);
259 if (!exist && (hash2_nulladdr != slot2)) {
260 hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
261 exist = udp_lib_lport_inuse2(net, snum, hslot2,
262 sk, saddr_comp);
263 }
264 if (exist)
265 goto fail_unlock;
266 else
267 goto found;
268 }
269scan_primary_hash:
270 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
271 saddr_comp, 0))
210 goto fail_unlock; 272 goto fail_unlock;
211 } 273 }
212found: 274found:
213 inet_sk(sk)->num = snum; 275 inet_sk(sk)->inet_num = snum;
214 sk->sk_hash = snum; 276 udp_sk(sk)->udp_port_hash = snum;
277 udp_sk(sk)->udp_portaddr_hash ^= snum;
215 if (sk_unhashed(sk)) { 278 if (sk_unhashed(sk)) {
216 sk_nulls_add_node_rcu(sk, &hslot->head); 279 sk_nulls_add_node_rcu(sk, &hslot->head);
280 hslot->count++;
217 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 281 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
282
283 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
284 spin_lock(&hslot2->lock);
285 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
286 &hslot2->head);
287 hslot2->count++;
288 spin_unlock(&hslot2->lock);
218 } 289 }
219 error = 0; 290 error = 0;
220fail_unlock: 291fail_unlock:
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
229 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); 300 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
230 301
231 return (!ipv6_only_sock(sk2) && 302 return (!ipv6_only_sock(sk2) &&
232 (!inet1->rcv_saddr || !inet2->rcv_saddr || 303 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
233 inet1->rcv_saddr == inet2->rcv_saddr)); 304 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
305}
306
307static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
308 unsigned int port)
309{
310 return jhash_1word(saddr, net_hash_mix(net)) ^ port;
234} 311}
235 312
236int udp_v4_get_port(struct sock *sk, unsigned short snum) 313int udp_v4_get_port(struct sock *sk, unsigned short snum)
237{ 314{
238 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); 315 unsigned int hash2_nulladdr =
316 udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
317 unsigned int hash2_partial =
318 udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
319
320 /* precompute partial secondary hash */
321 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
322 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
239} 323}
240 324
241static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, 325static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
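The secondary hash mixes the bound address with the port, and because XOR is self-inverse the address part can be precomputed with port 0 and the real port folded in later with a single XOR, exactly as udp_v4_get_port() does. A sketch with hash_addr() as a toy stand-in for jhash_1word(saddr, net_hash_mix(net)):

#include <stdint.h>
#include <assert.h>

/* toy stand-in for jhash_1word(saddr, net_hash_mix(net)) */
static uint32_t hash_addr(uint32_t saddr) { return saddr * 2654435761u; }

static uint32_t portaddr_hash(uint32_t saddr, uint16_t port)
{
        return hash_addr(saddr) ^ port;
}

int main(void)
{
        uint32_t saddr = 0xc0a80001;                /* 192.168.0.1       */
        uint32_t partial = portaddr_hash(saddr, 0); /* port not yet known */
        uint16_t port = 4242;

        /* once the port is chosen, fold it in with one XOR */
        assert((partial ^ port) == portaddr_hash(saddr, port));
        return 0;
}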
@@ -244,23 +328,23 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
244{ 328{
245 int score = -1; 329 int score = -1;
246 330
247 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 331 if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
248 !ipv6_only_sock(sk)) { 332 !ipv6_only_sock(sk)) {
249 struct inet_sock *inet = inet_sk(sk); 333 struct inet_sock *inet = inet_sk(sk);
250 334
251 score = (sk->sk_family == PF_INET ? 1 : 0); 335 score = (sk->sk_family == PF_INET ? 1 : 0);
252 if (inet->rcv_saddr) { 336 if (inet->inet_rcv_saddr) {
253 if (inet->rcv_saddr != daddr) 337 if (inet->inet_rcv_saddr != daddr)
254 return -1; 338 return -1;
255 score += 2; 339 score += 2;
256 } 340 }
257 if (inet->daddr) { 341 if (inet->inet_daddr) {
258 if (inet->daddr != saddr) 342 if (inet->inet_daddr != saddr)
259 return -1; 343 return -1;
260 score += 2; 344 score += 2;
261 } 345 }
262 if (inet->dport) { 346 if (inet->inet_dport) {
263 if (inet->dport != sport) 347 if (inet->inet_dport != sport)
264 return -1; 348 return -1;
265 score += 2; 349 score += 2;
266 } 350 }
@@ -273,6 +357,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
273 return score; 357 return score;
274} 358}
275 359
360/*
 361 * In this second variant, we check that (daddr, dport) matches (inet_rcv_saddr, inet_num)
362 */
363#define SCORE2_MAX (1 + 2 + 2 + 2)
364static inline int compute_score2(struct sock *sk, struct net *net,
365 __be32 saddr, __be16 sport,
366 __be32 daddr, unsigned int hnum, int dif)
367{
368 int score = -1;
369
370 if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
371 struct inet_sock *inet = inet_sk(sk);
372
373 if (inet->inet_rcv_saddr != daddr)
374 return -1;
375 if (inet->inet_num != hnum)
376 return -1;
377
378 score = (sk->sk_family == PF_INET ? 1 : 0);
379 if (inet->inet_daddr) {
380 if (inet->inet_daddr != saddr)
381 return -1;
382 score += 2;
383 }
384 if (inet->inet_dport) {
385 if (inet->inet_dport != sport)
386 return -1;
387 score += 2;
388 }
389 if (sk->sk_bound_dev_if) {
390 if (sk->sk_bound_dev_if != dif)
391 return -1;
392 score += 2;
393 }
394 }
395 return score;
396}
397
398
 399/* called with rcu_read_lock() */
400static struct sock *udp4_lib_lookup2(struct net *net,
401 __be32 saddr, __be16 sport,
402 __be32 daddr, unsigned int hnum, int dif,
403 struct udp_hslot *hslot2, unsigned int slot2)
404{
405 struct sock *sk, *result;
406 struct hlist_nulls_node *node;
407 int score, badness;
408
409begin:
410 result = NULL;
411 badness = -1;
412 udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
413 score = compute_score2(sk, net, saddr, sport,
414 daddr, hnum, dif);
415 if (score > badness) {
416 result = sk;
417 badness = score;
418 if (score == SCORE2_MAX)
419 goto exact_match;
420 }
421 }
422 /*
423 * if the nulls value we got at the end of this lookup is
424 * not the expected one, we must restart lookup.
425 * We probably met an item that was moved to another chain.
426 */
427 if (get_nulls_value(node) != slot2)
428 goto begin;
429
430 if (result) {
431exact_match:
432 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
433 result = NULL;
434 else if (unlikely(compute_score2(result, net, saddr, sport,
435 daddr, hnum, dif) < badness)) {
436 sock_put(result);
437 goto begin;
438 }
439 }
440 return result;
441}
442
276/* UDP is nearly always wildcarded out the wazoo, so it makes no sense to 443
277 * try harder than this. -DaveM 444
278 */ 445 */
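compute_score2() is a best-match search: mandatory fields (bound address, local port) reject outright, while each optional field that matches (connected peer address, peer port, bound device) adds 2, so a fully connected socket always beats a wildcard listener; SCORE2_MAX lets the scan stop at the first perfect candidate. A condensed, self-contained model of that scoring loop (struct cand and its field names are hypothetical):

/* Fields set to 0 mean "wildcard". Returns the index of the best
 * candidate, or -1 if none is compatible. */
struct cand { unsigned int daddr, dport, dif; };  /* hypothetical */

static int best_match(const struct cand *c, int n,
                      unsigned int saddr, unsigned int sport,
                      unsigned int dif)
{
        int best = -1, badness = -1;

        for (int i = 0; i < n; i++) {
                int score = 1;                    /* address-family match */
                if (c[i].daddr) {
                        if (c[i].daddr != saddr)
                                continue;         /* set but wrong: reject */
                        score += 2;
                }
                if (c[i].dport) {
                        if (c[i].dport != sport)
                                continue;
                        score += 2;
                }
                if (c[i].dif) {
                        if (c[i].dif != dif)
                                continue;
                        score += 2;
                }
                if (score > badness) {
                        best = i;
                        badness = score;
                        if (score == 7)           /* SCORE2_MAX: stop early */
                                break;
                }
        }
        return best;
}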
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
283 struct sock *sk, *result; 450 struct sock *sk, *result;
284 struct hlist_nulls_node *node; 451 struct hlist_nulls_node *node;
285 unsigned short hnum = ntohs(dport); 452 unsigned short hnum = ntohs(dport);
286 unsigned int hash = udp_hashfn(net, hnum); 453 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
287 struct udp_hslot *hslot = &udptable->hash[hash]; 454 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
288 int score, badness; 455 int score, badness;
289 456
290 rcu_read_lock(); 457 rcu_read_lock();
458 if (hslot->count > 10) {
459 hash2 = udp4_portaddr_hash(net, daddr, hnum);
460 slot2 = hash2 & udptable->mask;
461 hslot2 = &udptable->hash2[slot2];
462 if (hslot->count < hslot2->count)
463 goto begin;
464
465 result = udp4_lib_lookup2(net, saddr, sport,
466 daddr, hnum, dif,
467 hslot2, slot2);
468 if (!result) {
469 hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
470 slot2 = hash2 & udptable->mask;
471 hslot2 = &udptable->hash2[slot2];
472 if (hslot->count < hslot2->count)
473 goto begin;
474
475 result = udp4_lib_lookup2(net, saddr, sport,
476 INADDR_ANY, hnum, dif,
477 hslot2, slot2);
478 }
479 rcu_read_unlock();
480 return result;
481 }
291begin: 482begin:
292 result = NULL; 483 result = NULL;
293 badness = -1; 484 badness = -1;
@@ -304,7 +495,7 @@ begin:
304 * not the expected one, we must restart lookup. 495 * not the expected one, we must restart lookup.
305 * We probably met an item that was moved to another chain. 496 * We probably met an item that was moved to another chain.
306 */ 497 */
307 if (get_nulls_value(node) != hash) 498 if (get_nulls_value(node) != slot)
308 goto begin; 499 goto begin;
309 500
310 if (result) { 501 if (result) {
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
354 sk_nulls_for_each_from(s, node) { 545 sk_nulls_for_each_from(s, node) {
355 struct inet_sock *inet = inet_sk(s); 546 struct inet_sock *inet = inet_sk(s);
356 547
357 if (!net_eq(sock_net(s), net) || 548 if (!net_eq(sock_net(s), net) ||
358 s->sk_hash != hnum || 549 udp_sk(s)->udp_port_hash != hnum ||
359 (inet->daddr && inet->daddr != rmt_addr) || 550 (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
360 (inet->dport != rmt_port && inet->dport) || 551 (inet->inet_dport != rmt_port && inet->inet_dport) ||
361 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || 552 (inet->inet_rcv_saddr &&
362 ipv6_only_sock(s) || 553 inet->inet_rcv_saddr != loc_addr) ||
554 ipv6_only_sock(s) ||
363 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) 555 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
364 continue; 556 continue;
365 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) 557 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
642 } else { 834 } else {
643 if (sk->sk_state != TCP_ESTABLISHED) 835 if (sk->sk_state != TCP_ESTABLISHED)
644 return -EDESTADDRREQ; 836 return -EDESTADDRREQ;
645 daddr = inet->daddr; 837 daddr = inet->inet_daddr;
646 dport = inet->dport; 838 dport = inet->inet_dport;
647 /* Open fast path for connected socket. 839 /* Open fast path for connected socket.
648 Route will not be used, if at least one option is set. 840 Route will not be used, if at least one option is set.
649 */ 841 */
650 connected = 1; 842 connected = 1;
651 } 843 }
652 ipc.addr = inet->saddr; 844 ipc.addr = inet->inet_saddr;
653 845
654 ipc.oif = sk->sk_bound_dev_if; 846 ipc.oif = sk->sk_bound_dev_if;
655 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 847 err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
704 .proto = sk->sk_protocol, 896 .proto = sk->sk_protocol,
705 .flags = inet_sk_flowi_flags(sk), 897 .flags = inet_sk_flowi_flags(sk),
706 .uli_u = { .ports = 898 .uli_u = { .ports =
707 { .sport = inet->sport, 899 { .sport = inet->inet_sport,
708 .dport = dport } } }; 900 .dport = dport } } };
709 struct net *net = sock_net(sk); 901 struct net *net = sock_net(sk);
710 902
@@ -748,7 +940,7 @@ back_from_confirm:
748 inet->cork.fl.fl4_dst = daddr; 940 inet->cork.fl.fl4_dst = daddr;
749 inet->cork.fl.fl_ip_dport = dport; 941 inet->cork.fl.fl_ip_dport = dport;
750 inet->cork.fl.fl4_src = saddr; 942 inet->cork.fl.fl4_src = saddr;
751 inet->cork.fl.fl_ip_sport = inet->sport; 943 inet->cork.fl.fl_ip_sport = inet->inet_sport;
752 up->pending = AF_INET; 944 up->pending = AF_INET;
753 945
754do_append_data: 946do_append_data:
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk)
862 udp_lib_checksum_complete(skb)) { 1054 udp_lib_checksum_complete(skb)) {
863 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, 1055 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
864 IS_UDPLITE(sk)); 1056 IS_UDPLITE(sk));
1057 atomic_inc(&sk->sk_drops);
865 __skb_unlink(skb, rcvq); 1058 __skb_unlink(skb, rcvq);
866 __skb_queue_tail(&list_kill, skb); 1059 __skb_queue_tail(&list_kill, skb);
867 } 1060 }
@@ -925,7 +1118,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
925 struct inet_sock *inet = inet_sk(sk); 1118 struct inet_sock *inet = inet_sk(sk);
926 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 1119 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
927 struct sk_buff *skb; 1120 struct sk_buff *skb;
928 unsigned int ulen, copied; 1121 unsigned int ulen;
929 int peeked; 1122 int peeked;
930 int err; 1123 int err;
931 int is_udplite = IS_UDPLITE(sk); 1124 int is_udplite = IS_UDPLITE(sk);
@@ -946,10 +1139,9 @@ try_again:
946 goto out; 1139 goto out;
947 1140
948 ulen = skb->len - sizeof(struct udphdr); 1141 ulen = skb->len - sizeof(struct udphdr);
949 copied = len; 1142 if (len > ulen)
950 if (copied > ulen) 1143 len = ulen;
951 copied = ulen; 1144 else if (len < ulen)
952 else if (copied < ulen)
953 msg->msg_flags |= MSG_TRUNC; 1145 msg->msg_flags |= MSG_TRUNC;
954 1146
955 /* 1147 /*
@@ -958,14 +1150,14 @@ try_again:
958 * coverage checksum (UDP-Lite), do it before the copy. 1150 * coverage checksum (UDP-Lite), do it before the copy.
959 */ 1151 */
960 1152
961 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { 1153 if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
962 if (udp_lib_checksum_complete(skb)) 1154 if (udp_lib_checksum_complete(skb))
963 goto csum_copy_err; 1155 goto csum_copy_err;
964 } 1156 }
965 1157
966 if (skb_csum_unnecessary(skb)) 1158 if (skb_csum_unnecessary(skb))
967 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 1159 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
968 msg->msg_iov, copied); 1160 msg->msg_iov, len);
969 else { 1161 else {
970 err = skb_copy_and_csum_datagram_iovec(skb, 1162 err = skb_copy_and_csum_datagram_iovec(skb,
971 sizeof(struct udphdr), 1163 sizeof(struct udphdr),
@@ -982,7 +1174,7 @@ try_again:
982 UDP_INC_STATS_USER(sock_net(sk), 1174 UDP_INC_STATS_USER(sock_net(sk),
983 UDP_MIB_INDATAGRAMS, is_udplite); 1175 UDP_MIB_INDATAGRAMS, is_udplite);
984 1176
985 sock_recv_timestamp(msg, sk, skb); 1177 sock_recv_ts_and_drops(msg, sk, skb);
986 1178
987 /* Copy the address. */ 1179 /* Copy the address. */
988 if (sin) { 1180 if (sin) {
@@ -994,7 +1186,7 @@ try_again:
994 if (inet->cmsg_flags) 1186 if (inet->cmsg_flags)
995 ip_cmsg_recv(msg, skb); 1187 ip_cmsg_recv(msg, skb);
996 1188
997 err = copied; 1189 err = len;
998 if (flags & MSG_TRUNC) 1190 if (flags & MSG_TRUNC)
999 err = ulen; 1191 err = ulen;
1000 1192
@@ -1023,15 +1215,15 @@ int udp_disconnect(struct sock *sk, int flags)
1023 */ 1215 */
1024 1216
1025 sk->sk_state = TCP_CLOSE; 1217 sk->sk_state = TCP_CLOSE;
1026 inet->daddr = 0; 1218 inet->inet_daddr = 0;
1027 inet->dport = 0; 1219 inet->inet_dport = 0;
1028 sk->sk_bound_dev_if = 0; 1220 sk->sk_bound_dev_if = 0;
1029 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1221 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1030 inet_reset_saddr(sk); 1222 inet_reset_saddr(sk);
1031 1223
1032 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { 1224 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
1033 sk->sk_prot->unhash(sk); 1225 sk->sk_prot->unhash(sk);
1034 inet->sport = 0; 1226 inet->inet_sport = 0;
1035 } 1227 }
1036 sk_dst_reset(sk); 1228 sk_dst_reset(sk);
1037 return 0; 1229 return 0;
@@ -1042,13 +1234,22 @@ void udp_lib_unhash(struct sock *sk)
1042{ 1234{
1043 if (sk_hashed(sk)) { 1235 if (sk_hashed(sk)) {
1044 struct udp_table *udptable = sk->sk_prot->h.udp_table; 1236 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1045 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); 1237 struct udp_hslot *hslot, *hslot2;
1046 struct udp_hslot *hslot = &udptable->hash[hash]; 1238
1239 hslot = udp_hashslot(udptable, sock_net(sk),
1240 udp_sk(sk)->udp_port_hash);
1241 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1047 1242
1048 spin_lock_bh(&hslot->lock); 1243 spin_lock_bh(&hslot->lock);
1049 if (sk_nulls_del_node_init_rcu(sk)) { 1244 if (sk_nulls_del_node_init_rcu(sk)) {
1050 inet_sk(sk)->num = 0; 1245 hslot->count--;
1246 inet_sk(sk)->inet_num = 0;
1051 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 1247 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
1248
1249 spin_lock(&hslot2->lock);
1250 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1251 hslot2->count--;
1252 spin_unlock(&hslot2->lock);
1052 } 1253 }
1053 spin_unlock_bh(&hslot->lock); 1254 spin_unlock_bh(&hslot->lock);
1054 } 1255 }
@@ -1057,25 +1258,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
1057 1258
1058static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1259static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1059{ 1260{
1060 int is_udplite = IS_UDPLITE(sk); 1261 int rc = sock_queue_rcv_skb(sk, skb);
1061 int rc; 1262
1263 if (rc < 0) {
1264 int is_udplite = IS_UDPLITE(sk);
1062 1265
1063 if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
1064 /* Note that an ENOMEM error is charged twice */ 1266 /* Note that an ENOMEM error is charged twice */
1065 if (rc == -ENOMEM) { 1267 if (rc == -ENOMEM)
1066 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, 1268 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1067 is_udplite); 1269 is_udplite);
1068 atomic_inc(&sk->sk_drops); 1270 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1069 } 1271 kfree_skb(skb);
1070 goto drop; 1272 return -1;
1071 } 1273 }
1072 1274
1073 return 0; 1275 return 0;
1074 1276
1075drop:
1076 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1077 kfree_skb(skb);
1078 return -1;
1079} 1277}
1080 1278
1081/* returns: 1279/* returns:
@@ -1174,61 +1372,98 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1174 bh_lock_sock(sk); 1372 bh_lock_sock(sk);
1175 if (!sock_owned_by_user(sk)) 1373 if (!sock_owned_by_user(sk))
1176 rc = __udp_queue_rcv_skb(sk, skb); 1374 rc = __udp_queue_rcv_skb(sk, skb);
1177 else 1375 else if (sk_add_backlog(sk, skb)) {
1178 sk_add_backlog(sk, skb); 1376 bh_unlock_sock(sk);
1377 goto drop;
1378 }
1179 bh_unlock_sock(sk); 1379 bh_unlock_sock(sk);
1180 1380
1181 return rc; 1381 return rc;
1182 1382
1183drop: 1383drop:
1184 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1384 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1385 atomic_inc(&sk->sk_drops);
1185 kfree_skb(skb); 1386 kfree_skb(skb);
1186 return -1; 1387 return -1;
1187} 1388}
1188 1389
1390
1391static void flush_stack(struct sock **stack, unsigned int count,
1392 struct sk_buff *skb, unsigned int final)
1393{
1394 unsigned int i;
1395 struct sk_buff *skb1 = NULL;
1396 struct sock *sk;
1397
1398 for (i = 0; i < count; i++) {
1399 sk = stack[i];
1400 if (likely(skb1 == NULL))
1401 skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
1402
1403 if (!skb1) {
1404 atomic_inc(&sk->sk_drops);
1405 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1406 IS_UDPLITE(sk));
1407 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
1408 IS_UDPLITE(sk));
1409 }
1410
1411 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
1412 skb1 = NULL;
1413 }
1414 if (unlikely(skb1))
1415 kfree_skb(skb1);
1416}
1417
1189/* 1418/*
1190 * Multicasts and broadcasts go to each listener. 1419 * Multicasts and broadcasts go to each listener.
1191 * 1420 *
1192 * Note: called only from the BH handler context, 1421 * Note: called only from the BH handler context.
1193 * so we don't need to lock the hashes.
1194 */ 1422 */
1195static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 1423static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1196 struct udphdr *uh, 1424 struct udphdr *uh,
1197 __be32 saddr, __be32 daddr, 1425 __be32 saddr, __be32 daddr,
1198 struct udp_table *udptable) 1426 struct udp_table *udptable)
1199{ 1427{
1200 struct sock *sk; 1428 struct sock *sk, *stack[256 / sizeof(struct sock *)];
1201 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; 1429 struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
1202 int dif; 1430 int dif;
1431 unsigned int i, count = 0;
1203 1432
1204 spin_lock(&hslot->lock); 1433 spin_lock(&hslot->lock);
1205 sk = sk_nulls_head(&hslot->head); 1434 sk = sk_nulls_head(&hslot->head);
1206 dif = skb->dev->ifindex; 1435 dif = skb->dev->ifindex;
1207 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1436 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1208 if (sk) { 1437 while (sk) {
1209 struct sock *sknext = NULL; 1438 stack[count++] = sk;
1439 sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1440 daddr, uh->source, saddr, dif);
1441 if (unlikely(count == ARRAY_SIZE(stack))) {
1442 if (!sk)
1443 break;
1444 flush_stack(stack, count, skb, ~0);
1445 count = 0;
1446 }
1447 }
1448 /*
 1449 * before releasing the chain lock, we must take a reference on the sockets
1450 */
1451 for (i = 0; i < count; i++)
1452 sock_hold(stack[i]);
1210 1453
1211 do {
1212 struct sk_buff *skb1 = skb;
1213
1214 sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1215 daddr, uh->source, saddr,
1216 dif);
1217 if (sknext)
1218 skb1 = skb_clone(skb, GFP_ATOMIC);
1219
1220 if (skb1) {
1221 int ret = udp_queue_rcv_skb(sk, skb1);
1222 if (ret > 0)
1223 /* we should probably re-process instead
1224 * of dropping packets here. */
1225 kfree_skb(skb1);
1226 }
1227 sk = sknext;
1228 } while (sknext);
1229 } else
1230 consume_skb(skb);
1231 spin_unlock(&hslot->lock); 1454 spin_unlock(&hslot->lock);
1455
1456 /*
1457 * do the slow work with no lock held
1458 */
1459 if (count) {
1460 flush_stack(stack, count, skb, count - 1);
1461
1462 for (i = 0; i < count; i++)
1463 sock_put(stack[i]);
1464 } else {
1465 kfree_skb(skb);
1466 }
1232 return 0; 1467 return 0;
1233} 1468}
1234 1469
@@ -1292,6 +1527,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1292 1527
1293 uh = udp_hdr(skb); 1528 uh = udp_hdr(skb);
1294 ulen = ntohs(uh->len); 1529 ulen = ntohs(uh->len);
1530 saddr = ip_hdr(skb)->saddr;
1531 daddr = ip_hdr(skb)->daddr;
1532
1295 if (ulen > skb->len) 1533 if (ulen > skb->len)
1296 goto short_packet; 1534 goto short_packet;
1297 1535
@@ -1305,9 +1543,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1305 if (udp4_csum_init(skb, uh, proto)) 1543 if (udp4_csum_init(skb, uh, proto))
1306 goto csum_error; 1544 goto csum_error;
1307 1545
1308 saddr = ip_hdr(skb)->saddr;
1309 daddr = ip_hdr(skb)->daddr;
1310
1311 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1546 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1312 return __udp4_lib_mcast_deliver(net, skb, uh, 1547 return __udp4_lib_mcast_deliver(net, skb, uh,
1313 saddr, daddr, udptable); 1548 saddr, daddr, udptable);
@@ -1620,9 +1855,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
1620 struct udp_iter_state *state = seq->private; 1855 struct udp_iter_state *state = seq->private;
1621 struct net *net = seq_file_net(seq); 1856 struct net *net = seq_file_net(seq);
1622 1857
1623 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1858 for (state->bucket = start; state->bucket <= state->udp_table->mask;
1859 ++state->bucket) {
1624 struct hlist_nulls_node *node; 1860 struct hlist_nulls_node *node;
1625 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; 1861 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1862
1863 if (hlist_nulls_empty(&hslot->head))
1864 continue;
1865
1626 spin_lock_bh(&hslot->lock); 1866 spin_lock_bh(&hslot->lock);
1627 sk_nulls_for_each(sk, node, &hslot->head) { 1867 sk_nulls_for_each(sk, node, &hslot->head) {
1628 if (!net_eq(sock_net(sk), net)) 1868 if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1887,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1647 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1887 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1648 1888
1649 if (!sk) { 1889 if (!sk) {
1650 if (state->bucket < UDP_HTABLE_SIZE) 1890 if (state->bucket <= state->udp_table->mask)
1651 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1891 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1652 return udp_get_first(seq, state->bucket + 1); 1892 return udp_get_first(seq, state->bucket + 1);
1653 } 1893 }
@@ -1667,7 +1907,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1667static void *udp_seq_start(struct seq_file *seq, loff_t *pos) 1907static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1668{ 1908{
1669 struct udp_iter_state *state = seq->private; 1909 struct udp_iter_state *state = seq->private;
1670 state->bucket = UDP_HTABLE_SIZE; 1910 state->bucket = MAX_UDP_PORTS;
1671 1911
1672 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; 1912 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1673} 1913}
@@ -1689,7 +1929,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
1689{ 1929{
1690 struct udp_iter_state *state = seq->private; 1930 struct udp_iter_state *state = seq->private;
1691 1931
1692 if (state->bucket < UDP_HTABLE_SIZE) 1932 if (state->bucket <= state->udp_table->mask)
1693 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1933 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1694} 1934}
1695 1935
@@ -1744,12 +1984,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
1744 int bucket, int *len) 1984 int bucket, int *len)
1745{ 1985{
1746 struct inet_sock *inet = inet_sk(sp); 1986 struct inet_sock *inet = inet_sk(sp);
1747 __be32 dest = inet->daddr; 1987 __be32 dest = inet->inet_daddr;
1748 __be32 src = inet->rcv_saddr; 1988 __be32 src = inet->inet_rcv_saddr;
1749 __u16 destp = ntohs(inet->dport); 1989 __u16 destp = ntohs(inet->inet_dport);
1750 __u16 srcp = ntohs(inet->sport); 1990 __u16 srcp = ntohs(inet->inet_sport);
1751 1991
1752 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 1992 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1753 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", 1993 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
1754 bucket, src, srcp, dest, destp, sp->sk_state, 1994 bucket, src, srcp, dest, destp, sp->sk_state,
1755 sk_wmem_alloc_get(sp), 1995 sk_wmem_alloc_get(sp),
@@ -1789,12 +2029,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = {
1789 }, 2029 },
1790}; 2030};
1791 2031
1792static int udp4_proc_init_net(struct net *net) 2032static int __net_init udp4_proc_init_net(struct net *net)
1793{ 2033{
1794 return udp_proc_register(net, &udp4_seq_afinfo); 2034 return udp_proc_register(net, &udp4_seq_afinfo);
1795} 2035}
1796 2036
1797static void udp4_proc_exit_net(struct net *net) 2037static void __net_exit udp4_proc_exit_net(struct net *net)
1798{ 2038{
1799 udp_proc_unregister(net, &udp4_seq_afinfo); 2039 udp_proc_unregister(net, &udp4_seq_afinfo);
1800} 2040}
@@ -1815,21 +2055,60 @@ void udp4_proc_exit(void)
1815} 2055}
1816#endif /* CONFIG_PROC_FS */ 2056#endif /* CONFIG_PROC_FS */
1817 2057
1818void __init udp_table_init(struct udp_table *table) 2058static __initdata unsigned long uhash_entries;
2059static int __init set_uhash_entries(char *str)
1819{ 2060{
1820 int i; 2061 if (!str)
2062 return 0;
2063 uhash_entries = simple_strtoul(str, &str, 0);
2064 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2065 uhash_entries = UDP_HTABLE_SIZE_MIN;
2066 return 1;
2067}
2068__setup("uhash_entries=", set_uhash_entries);
1821 2069
1822 for (i = 0; i < UDP_HTABLE_SIZE; i++) { 2070void __init udp_table_init(struct udp_table *table, const char *name)
2071{
2072 unsigned int i;
2073
2074 if (!CONFIG_BASE_SMALL)
2075 table->hash = alloc_large_system_hash(name,
2076 2 * sizeof(struct udp_hslot),
2077 uhash_entries,
2078 21, /* one slot per 2 MB */
2079 0,
2080 &table->log,
2081 &table->mask,
2082 64 * 1024);
2083 /*
2084 * Make sure hash table has the minimum size
2085 */
2086 if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
2087 table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
2088 2 * sizeof(struct udp_hslot), GFP_KERNEL);
2089 if (!table->hash)
2090 panic(name);
2091 table->log = ilog2(UDP_HTABLE_SIZE_MIN);
2092 table->mask = UDP_HTABLE_SIZE_MIN - 1;
2093 }
2094 table->hash2 = table->hash + (table->mask + 1);
2095 for (i = 0; i <= table->mask; i++) {
1823 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); 2096 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
2097 table->hash[i].count = 0;
1824 spin_lock_init(&table->hash[i].lock); 2098 spin_lock_init(&table->hash[i].lock);
1825 } 2099 }
2100 for (i = 0; i <= table->mask; i++) {
2101 INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
2102 table->hash2[i].count = 0;
2103 spin_lock_init(&table->hash2[i].lock);
2104 }
1826} 2105}
1827 2106
1828void __init udp_init(void) 2107void __init udp_init(void)
1829{ 2108{
1830 unsigned long nr_pages, limit; 2109 unsigned long nr_pages, limit;
1831 2110
1832 udp_table_init(&udp_table); 2111 udp_table_init(&udp_table, "UDP");
1833 /* Set the pressure threshold up by the same strategy of TCP. It is a 2112 /* Set the pressure threshold up by the same strategy of TCP. It is a
1834 * fraction of global memory that is up to 1/2 at 256 MB, decreasing 2113 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1835 * toward zero with the amount of memory, with a floor of 128 pages. 2114 * toward zero with the amount of memory, with a floor of 128 pages.
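Worth noting in udp_table_init() above: a single allocation carries both tables (two struct udp_hslot per slot), with hash2 starting right after the last primary slot, and 'uhash_entries=N' on the kernel command line can override the sizing (clamped to at least UDP_HTABLE_SIZE_MIN). A minimal model of the resulting slot selection; the mixing of net_hash_mix() into the primary hash is elided here:

#include <stdint.h>

struct hslot { unsigned int count; };  /* model only */

struct udp_tbl {
        struct hslot *hash;   /* primary: indexed by port hash           */
        struct hslot *hash2;  /* secondary: indexed by (addr, port) hash */
        unsigned int mask;    /* slots - 1, slots a power of two         */
        unsigned int log;     /* ilog2(slots)                            */
};

static struct hslot *slot_by_port(struct udp_tbl *t, uint32_t porthash)
{
        return &t->hash[porthash & t->mask];
}

static struct hslot *slot_by_addrport(struct udp_tbl *t, uint32_t pahash)
{
        return &t->hash2[pahash & t->mask];
}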
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 95248d7f75ec..6610bf76369f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,7 +12,7 @@
12 */ 12 */
13#include "udp_impl.h" 13#include "udp_impl.h"
14 14
15struct udp_table udplite_table; 15struct udp_table udplite_table __read_mostly;
16EXPORT_SYMBOL(udplite_table); 16EXPORT_SYMBOL(udplite_table);
17 17
18static int udplite_rcv(struct sk_buff *skb) 18static int udplite_rcv(struct sk_buff *skb)
@@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = {
64 .protocol = IPPROTO_UDPLITE, 64 .protocol = IPPROTO_UDPLITE,
65 .prot = &udplite_prot, 65 .prot = &udplite_prot,
66 .ops = &inet_dgram_ops, 66 .ops = &inet_dgram_ops,
67 .capability = -1,
68 .no_check = 0, /* must checksum (RFC 3828) */ 67 .no_check = 0, /* must checksum (RFC 3828) */
69 .flags = INET_PROTOSW_PERMANENT, 68 .flags = INET_PROTOSW_PERMANENT,
70}; 69};
@@ -82,12 +81,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = {
82 }, 81 },
83}; 82};
84 83
85static int udplite4_proc_init_net(struct net *net) 84static int __net_init udplite4_proc_init_net(struct net *net)
86{ 85{
87 return udp_proc_register(net, &udplite4_seq_afinfo); 86 return udp_proc_register(net, &udplite4_seq_afinfo);
88} 87}
89 88
90static void udplite4_proc_exit_net(struct net *net) 89static void __net_exit udplite4_proc_exit_net(struct net *net)
91{ 90{
92 udp_proc_unregister(net, &udplite4_seq_afinfo); 91 udp_proc_unregister(net, &udplite4_seq_afinfo);
93} 92}
@@ -110,7 +109,7 @@ static inline int udplite4_proc_init(void)
110 109
111void __init udplite4_register(void) 110void __init udplite4_register(void)
112{ 111{
113 udp_table_init(&udplite_table); 112 udp_table_init(&udplite_table, "UDP-Lite");
114 if (proto_register(&udplite_prot, 1)) 113 if (proto_register(&udplite_prot, 1))
115 goto out_register_err; 114 goto out_register_err;
116 115
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f9f922a0ba88..c791bb63203f 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -9,6 +9,7 @@
9 * 9 *
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/string.h> 14#include <linux/string.h>
14#include <linux/netfilter.h> 15#include <linux/netfilter.h>
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 3444f3b34eca..6f368413eb0e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -4,6 +4,7 @@
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> 4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */ 5 */
6 6
7#include <linux/gfp.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 74fb2eb833ec..e4a1483fba77 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,7 +15,6 @@
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16#include <net/ip.h> 16#include <net/ip.h>
17 17
18static struct dst_ops xfrm4_dst_ops;
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 18static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 19
21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 20static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
@@ -92,11 +91,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
92 return 0; 91 return 0;
93} 92}
94 93
95static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) 94static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
95 struct flowi *fl)
96{ 96{
97 struct rtable *rt = (struct rtable *)xdst->route; 97 struct rtable *rt = (struct rtable *)xdst->route;
98 98
99 xdst->u.rt.fl = rt->fl; 99 xdst->u.rt.fl = *fl;
100 100
101 xdst->u.dst.dev = dev; 101 xdst->u.dst.dev = dev;
102 dev_hold(dev); 102 dev_hold(dev);
@@ -190,8 +190,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
190 190
191static inline int xfrm4_garbage_collect(struct dst_ops *ops) 191static inline int xfrm4_garbage_collect(struct dst_ops *ops)
192{ 192{
193 xfrm4_policy_afinfo.garbage_collect(&init_net); 193 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
194 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 194
195 xfrm4_policy_afinfo.garbage_collect(net);
196 return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
195} 197}
196 198
197static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 199static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
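The garbage collector above recovers its namespace with the standard container_of() idiom: since xfrm4_dst_ops is embedded in struct net, subtracting the member offset from the ops pointer yields the enclosing structure. A minimal userspace rendition of the same pointer arithmetic:

#include <stddef.h>
#include <assert.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dst_ops { int gc_thresh; };                 /* model only */
struct net { int id; struct dst_ops xfrm4_dst_ops; };

int main(void)
{
        struct net n = { .id = 1 };
        struct dst_ops *ops = &n.xfrm4_dst_ops;

        /* recover the enclosing struct net from the embedded member */
        assert(container_of(ops, struct net, xfrm4_dst_ops) == &n);
        return 0;
}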
@@ -267,9 +269,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
267#ifdef CONFIG_SYSCTL 269#ifdef CONFIG_SYSCTL
268static struct ctl_table xfrm4_policy_table[] = { 270static struct ctl_table xfrm4_policy_table[] = {
269 { 271 {
270 .ctl_name = CTL_UNNUMBERED,
271 .procname = "xfrm4_gc_thresh", 272 .procname = "xfrm4_gc_thresh",
272 .data = &xfrm4_dst_ops.gc_thresh, 273 .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
273 .maxlen = sizeof(int), 274 .maxlen = sizeof(int),
274 .mode = 0644, 275 .mode = 0644,
275 .proc_handler = proc_dointvec, 276 .proc_handler = proc_dointvec,
@@ -296,8 +297,6 @@ static void __exit xfrm4_policy_fini(void)
296 297
297void __init xfrm4_init(int rt_max_size) 298void __init xfrm4_init(int rt_max_size)
298{ 299{
299 xfrm4_state_init();
300 xfrm4_policy_init();
301 /* 300 /*
302 * Select a default value for the gc_thresh based on the main route 301 * Select a default value for the gc_thresh based on the main route
303 * table hash size. It seems to me the worst case scenario is when 302 * table hash size. It seems to me the worst case scenario is when
@@ -309,6 +308,9 @@ void __init xfrm4_init(int rt_max_size)
309 * and start cleaning when were 1/2 full 308 * and start cleaning when were 1/2 full
310 */ 309 */
311 xfrm4_dst_ops.gc_thresh = rt_max_size/2; 310 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
311
312 xfrm4_state_init();
313 xfrm4_policy_init();
312#ifdef CONFIG_SYSCTL 314#ifdef CONFIG_SYSCTL
313 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, 315 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
314 xfrm4_policy_table); 316 xfrm4_policy_table);