author    Linus Torvalds <torvalds@linux-foundation.org>  2009-12-08 10:55:01 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2009-12-08 10:55:01 -0500
commit    d7fc02c7bae7b1cf69269992cf880a43a350cdaa (patch)
tree      a43d56fa72913a1cc98a0bbebe054d08581b3a7c /net/ipv4
parent    ee1262dbc65ce0b6234a915d8432171e8d77f518 (diff)
parent    28b4d5cc17c20786848cdc07b7ea237a309776bb (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1815 commits)
  mac80211: fix reorder buffer release
  iwmc3200wifi: Enable wimax core through module parameter
  iwmc3200wifi: Add wifi-wimax coexistence mode as a module parameter
  iwmc3200wifi: Coex table command does not expect a response
  iwmc3200wifi: Update wiwi priority table
  iwlwifi: driver version track kernel version
  iwlwifi: indicate uCode type when fail dump error/event log
  iwl3945: remove duplicated event logging code
  b43: fix two warnings
  ipw2100: fix rebooting hang with driver loaded
  cfg80211: indent regulatory messages with spaces
  iwmc3200wifi: fix NULL pointer dereference in pmkid update
  mac80211: Fix TX status reporting for injected data frames
  ath9k: enable 2GHz band only if the device supports it
  airo: Fix integer overflow warning
  rt2x00: Fix padding bug on L2PAD devices.
  WE: Fix set events not propagated
  b43legacy: avoid PPC fault during resume
  b43: avoid PPC fault during resume
  tcp: fix a timewait refcnt race
  ...

Fix up conflicts due to sysctl cleanups (dead sysctl_check code and
CTL_UNNUMBERED removed) in
	kernel/sysctl_check.c
	net/ipv4/sysctl_net_ipv4.c
	net/ipv6/addrconf.c
	net/sctp/sysctl.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c | 74
-rw-r--r--  net/ipv4/ah4.c | 297
-rw-r--r--  net/ipv4/cipso_ipv4.c | 2
-rw-r--r--  net/ipv4/datagram.c | 18
-rw-r--r--  net/ipv4/devinet.c | 158
-rw-r--r--  net/ipv4/esp4.c | 2
-rw-r--r--  net/ipv4/fib_frontend.c | 50
-rw-r--r--  net/ipv4/fib_hash.c | 25
-rw-r--r--  net/ipv4/fib_rules.c | 16
-rw-r--r--  net/ipv4/fib_semantics.c | 4
-rw-r--r--  net/ipv4/fib_trie.c | 26
-rw-r--r--  net/ipv4/icmp.c | 13
-rw-r--r--  net/ipv4/igmp.c | 50
-rw-r--r--  net/ipv4/inet_connection_sock.c | 27
-rw-r--r--  net/ipv4/inet_diag.c | 28
-rw-r--r--  net/ipv4/inet_hashtables.c | 49
-rw-r--r--  net/ipv4/inet_lro.c | 36
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 112
-rw-r--r--  net/ipv4/inetpeer.c | 5
-rw-r--r--  net/ipv4/ip_fragment.c | 11
-rw-r--r--  net/ipv4/ip_gre.c | 88
-rw-r--r--  net/ipv4/ip_input.c | 4
-rw-r--r--  net/ipv4/ip_output.c | 23
-rw-r--r--  net/ipv4/ip_sockglue.c | 12
-rw-r--r--  net/ipv4/ipconfig.c | 13
-rw-r--r--  net/ipv4/ipip.c | 97
-rw-r--r--  net/ipv4/ipmr.c | 29
-rw-r--r--  net/ipv4/netfilter.c | 8
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 22
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 7
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 46
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 20
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 8
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 22
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 4
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 6
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 4
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 4
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 28
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 22
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 10
-rw-r--r--  net/ipv4/raw.c | 33
-rw-r--r--  net/ipv4/route.c | 102
-rw-r--r--  net/ipv4/syncookies.c | 33
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 11
-rw-r--r--  net/ipv4/tcp.c | 289
-rw-r--r--  net/ipv4/tcp_diag.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 10
-rw-r--r--  net/ipv4/tcp_input.c | 103
-rw-r--r--  net/ipv4/tcp_ipv4.c | 223
-rw-r--r--  net/ipv4/tcp_lp.c | 4
-rw-r--r--  net/ipv4/tcp_minisocks.c | 76
-rw-r--r--  net/ipv4/tcp_output.c | 329
-rw-r--r--  net/ipv4/tcp_probe.c | 13
-rw-r--r--  net/ipv4/tcp_timer.c | 12
-rw-r--r--  net/ipv4/tcp_veno.c | 5
-rw-r--r--  net/ipv4/tcp_yeah.c | 4
-rw-r--r--  net/ipv4/udp.c | 484
-rw-r--r--  net/ipv4/udplite.c | 5
62 files changed, 2207 insertions, 1027 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 57737b8d1711..7d12c6a9b19b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -174,12 +174,12 @@ static int inet_autobind(struct sock *sk)
 	/* We may need to bind the socket. */
 	lock_sock(sk);
 	inet = inet_sk(sk);
-	if (!inet->num) {
+	if (!inet->inet_num) {
 		if (sk->sk_prot->get_port(sk, 0)) {
 			release_sock(sk);
 			return -EAGAIN;
 		}
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 	}
 	release_sock(sk);
 	return 0;
@@ -262,7 +262,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
  *	Create an inet socket.
  */
 
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	struct inet_protosw *answer;
@@ -325,7 +326,7 @@ lookup_protocol:
 	}
 
 	err = -EPERM;
-	if (answer->capability > 0 && !capable(answer->capability))
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
 
 	err = -EAFNOSUPPORT;
@@ -354,7 +355,7 @@ lookup_protocol:
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
 	if (SOCK_RAW == sock->type) {
-		inet->num = protocol;
+		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
 			inet->hdrincl = 1;
 	}
@@ -364,7 +365,7 @@ lookup_protocol:
 	else
 		inet->pmtudisc = IP_PMTUDISC_WANT;
 
-	inet->id = 0;
+	inet->inet_id = 0;
 
 	sock_init_data(sock, sk);
 
@@ -381,13 +382,13 @@ lookup_protocol:
 
 	sk_refcnt_debug_inc(sk);
 
-	if (inet->num) {
+	if (inet->inet_num) {
 		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
 		 * creation time automatically
 		 * shares.
 		 */
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 		/* Add to protocol hash chains. */
 		sk->sk_prot->hash(sk);
 	}
@@ -494,27 +495,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	/* Check these errors (active socket, double bind). */
 	err = -EINVAL;
-	if (sk->sk_state != TCP_CLOSE || inet->num)
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
 		goto out_release_sock;
 
-	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-		inet->saddr = 0;  /* Use device */
+		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
 	if (sk->sk_prot->get_port(sk, snum)) {
-		inet->saddr = inet->rcv_saddr = 0;
+		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
 		goto out_release_sock;
 	}
 
-	if (inet->rcv_saddr)
+	if (inet->inet_rcv_saddr)
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-	inet->sport = htons(inet->num);
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk_dst_reset(sk);
 	err = 0;
 out_release_sock:
@@ -532,7 +533,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 }
@@ -685,21 +686,21 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 {
 	struct sock *sk = sock->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
 
 	sin->sin_family = AF_INET;
 	if (peer) {
-		if (!inet->dport ||
+		if (!inet->inet_dport ||
 		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
 		     peer == 1))
 			return -ENOTCONN;
-		sin->sin_port = inet->dport;
-		sin->sin_addr.s_addr = inet->daddr;
+		sin->sin_port = inet->inet_dport;
+		sin->sin_addr.s_addr = inet->inet_daddr;
 	} else {
-		__be32 addr = inet->rcv_saddr;
+		__be32 addr = inet->inet_rcv_saddr;
 		if (!addr)
-			addr = inet->saddr;
-		sin->sin_port = inet->sport;
+			addr = inet->inet_saddr;
+		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
 	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -714,7 +715,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
@@ -728,7 +729,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
@@ -931,7 +932,7 @@ static const struct proto_ops inet_sockraw_ops = {
 #endif
 };
 
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
 	.family = PF_INET,
 	.create = inet_create,
 	.owner = THIS_MODULE,
@@ -947,7 +948,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol = IPPROTO_TCP,
 		.prot = &tcp_prot,
 		.ops = &inet_stream_ops,
-		.capability = -1,
 		.no_check = 0,
 		.flags = INET_PROTOSW_PERMANENT |
 			 INET_PROTOSW_ICSK,
@@ -958,7 +958,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol = IPPROTO_UDP,
 		.prot = &udp_prot,
 		.ops = &inet_dgram_ops,
-		.capability = -1,
 		.no_check = UDP_CSUM_DEFAULT,
 		.flags = INET_PROTOSW_PERMANENT,
 	},
@@ -969,7 +968,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol = IPPROTO_IP,	/* wild card */
 		.prot = &raw_prot,
 		.ops = &inet_sockraw_ops,
-		.capability = CAP_NET_RAW,
 		.no_check = UDP_CSUM_DEFAULT,
 		.flags = INET_PROTOSW_REUSE,
 	}
@@ -1059,9 +1057,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	int err;
 	struct rtable *rt;
-	__be32 old_saddr = inet->saddr;
+	__be32 old_saddr = inet->inet_saddr;
 	__be32 new_saddr;
-	__be32 daddr = inet->daddr;
+	__be32 daddr = inet->inet_daddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
@@ -1071,7 +1069,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 			       RT_CONN_FLAGS(sk),
 			       sk->sk_bound_dev_if,
 			       sk->sk_protocol,
-			       inet->sport, inet->dport, sk, 0);
+			       inet->inet_sport, inet->inet_dport, sk, 0);
 	if (err)
 		return err;
 
@@ -1087,7 +1085,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 		       __func__, &old_saddr, &new_saddr);
 	}
 
-	inet->saddr = inet->rcv_saddr = new_saddr;
+	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
 
 	/*
 	 * XXX The only one ugly spot where we need to
@@ -1113,7 +1111,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
-	daddr = inet->daddr;
+	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 {
@@ -1123,7 +1121,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.nl_u = {
 			.ip4_u = {
 				.daddr	= daddr,
-				.saddr	= inet->saddr,
+				.saddr	= inet->inet_saddr,
 				.tos	= RT_CONN_FLAGS(sk),
 			},
 		},
@@ -1131,8 +1129,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.flags = inet_sk_flowi_flags(sk),
 		.uli_u = {
 			.ports = {
-				.sport = inet->sport,
-				.dport = inet->dport,
+				.sport = inet->inet_sport,
+				.dport = inet->inet_dport,
 			},
 		},
 	};
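The af_inet.c hunks above carry two series-wide themes: the inet_sock field renames (num, sport, daddr and friends gain an inet_ prefix), and the removal of the per-protosw .capability field in favour of one CAP_NET_RAW test for SOCK_RAW sockets in inet_create(), which is skipped for in-kernel (kern) callers. A minimal userspace sketch of the visible effect, assuming an unprivileged process; this program is illustrative and not part of the patch:

/* sketch: without CAP_NET_RAW, the new check in inet_create() rejects
 * any SOCK_RAW socket with EPERM before the protocol lookup succeeds */
#include <errno.h>
#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	if (fd == -1 && errno == EPERM)
		printf("raw socket refused: CAP_NET_RAW is required\n");
	return 0;
}

With CAP_NET_RAW (or as root) the same call succeeds, which is the behaviour the old per-entry .capability = CAP_NET_RAW provided for the raw protosw only.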
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5c662703eb1e..7ed3e4ae93ae 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,3 +1,4 @@
+#include <crypto/hash.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <net/ip.h>
@@ -5,10 +6,67 @@
 #include <net/ah.h>
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
-#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
 #include <net/icmp.h>
 #include <net/protocol.h>
 
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int len;
+
+	len = size + crypto_ahash_digestsize(ahash) +
+	      (crypto_ahash_alignmask(ahash) &
+	       ~(crypto_tfm_ctx_alignment() - 1));
+
+	len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+	len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+	return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+			     unsigned int offset)
+{
+	return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	struct ahash_request *req;
+
+	req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+				crypto_tfm_ctx_alignment());
+
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					    struct ahash_request *req)
+{
+	return (void *)ALIGN((unsigned long)(req + 1) +
+			     crypto_ahash_reqsize(ahash),
+			     __alignof__(struct scatterlist));
+}
 
 /* Clear mutable options and find final destination to substitute
  * into IP header for icv calculation. Options are already checked
@@ -54,20 +112,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
 	return 0;
 }
 
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+	u8 *icv;
+	struct iphdr *iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct iphdr *top_iph = ip_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+
+	iph = AH_SKB_CB(skb)->tmp;
+	icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	err = ah->nexthdr;
+
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_output_resume(skb, err);
+}
+
 static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
+	int nfrags;
+	int ihl;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
-	union {
-		struct iphdr	iph;
-		char		buf[60];
-	} tmp_iph;
+
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
 
 	skb_push(skb, -skb_network_offset(skb));
+	ah = ip_auth_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	err = -ENOMEM;
+	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	if (!iph)
+		goto out;
+
+	icv = ah_tmp_icv(ahash, iph, ihl);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
 	top_iph = ip_hdr(skb);
-	iph = &tmp_iph.iph;
 
 	iph->tos = top_iph->tos;
 	iph->ttl = top_iph->ttl;
@@ -78,10 +188,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
 		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
 		if (err)
-			goto error;
+			goto out_free;
 	}
 
-	ah = ip_auth_hdr(skb);
 	ah->nexthdr = *skb_mac_header(skb);
 	*skb_mac_header(skb) = IPPROTO_AH;
 
@@ -91,20 +200,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ahp = x->data;
 	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
 
-	spin_lock_bh(&x->lock);
-	err = ah_mac_digest(ahp, skb, ah->auth_data);
-	memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
-	spin_unlock_bh(&x->lock);
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	if (err)
-		goto error;
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+	AH_SKB_CB(skb)->tmp = iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
 
 	top_iph->tos = iph->tos;
 	top_iph->ttl = iph->ttl;
@@ -114,28 +234,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
 	}
 
-	err = 0;
-
-error:
+out_free:
+	kfree(iph);
+out:
 	return err;
 }
 
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+	u8 *auth_data;
+	u8 *icv;
+	struct iphdr *work_iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+
+	work_iph = AH_SKB_CB(skb)->tmp;
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+
+	err = ah->nexthdr;
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
 static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int ah_hlen;
 	int ihl;
 	int nexthdr;
-	int err = -EINVAL;
-	struct iphdr *iph;
+	int nfrags;
+	u8 *auth_data;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *work_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
-	char work_buf[60];
+	int err = -ENOMEM;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
 
 	ah = (struct ip_auth_hdr *)skb->data;
 	ahp = x->data;
+	ahash = ahp->ahash;
+
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
@@ -156,9 +315,24 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	ah = (struct ip_auth_hdr *)skb->data;
 	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (!work_iph)
+		goto out;
+
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
 
-	ihl = skb->data - skb_network_header(skb);
-	memcpy(work_buf, iph, ihl);
+	memcpy(work_iph, iph, ihl);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
 	iph->ttl = 0;
 	iph->tos = 0;
@@ -166,35 +340,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph->check = 0;
 	if (ihl > sizeof(*iph)) {
 		__be32 dummy;
-		if (ip_clear_mutable_options(iph, &dummy))
-			goto out;
+		err = ip_clear_mutable_options(iph, &dummy);
+		if (err)
+			goto out_free;
 	}
 
-	spin_lock(&x->lock);
-	{
-		u8 auth_data[MAX_AH_AUTH_LEN];
+	skb_push(skb, ihl);
 
-		memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
-		skb_push(skb, ihl);
-		err = ah_mac_digest(ahp, skb, ah->auth_data);
-		if (err)
-			goto unlock;
-		if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len))
-			err = -EBADMSG;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
 	}
-unlock:
-	spin_unlock(&x->lock);
 
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
 	if (err)
-		goto out;
+		goto out_free;
 
 	skb->network_header += ah_hlen;
-	memcpy(skb_network_header(skb), work_buf, ihl);
-	skb->transport_header = skb->network_header;
+	memcpy(skb_network_header(skb), work_iph, ihl);
 	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
 
-	return nexthdr;
+	err = nexthdr;
 
+out_free:
+	kfree (work_iph);
 out:
 	return err;
 }
@@ -222,7 +405,7 @@ static int ah_init_state(struct xfrm_state *x)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
-	struct crypto_hash *tfm;
+	struct crypto_ahash *ahash;
 
 	if (!x->aalg)
 		goto error;
@@ -231,44 +414,40 @@ static int ah_init_state(struct xfrm_state *x)
 		goto error;
 
 	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
-	if (ahp == NULL)
+	if (!ahp)
 		return -ENOMEM;
 
-	tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm))
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash))
 		goto error;
 
-	ahp->tfm = tfm;
-	if (crypto_hash_setkey(tfm, x->aalg->alg_key,
-			       (x->aalg->alg_key_len + 7) / 8))
+	ahp->ahash = ahash;
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+				(x->aalg->alg_key_len + 7) / 8))
 		goto error;
 
 	/*
 	 * Lookup the algorithm description maintained by xfrm_algo,
 	 * verify crypto transform properties, and store information
 	 * we need for AH processing.  This lookup cannot fail here
-	 * after a successful crypto_alloc_hash().
+	 * after a successful crypto_alloc_ahash().
	 */
 	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
 	BUG_ON(!aalg_desc);
 
 	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
-	    crypto_hash_digestsize(tfm)) {
+	    crypto_ahash_digestsize(ahash)) {
 		printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
-		       x->aalg->alg_name, crypto_hash_digestsize(tfm),
+		       x->aalg->alg_name, crypto_ahash_digestsize(ahash),
 		       aalg_desc->uinfo.auth.icv_fullbits/8);
 		goto error;
 	}
 
 	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
-	ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
-	if (!ahp->work_icv)
-		goto error;
-
 	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
 					  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -279,8 +458,7 @@ static int ah_init_state(struct xfrm_state *x)
 
 error:
 	if (ahp) {
-		kfree(ahp->work_icv);
-		crypto_free_hash(ahp->tfm);
+		crypto_free_ahash(ahp->ahash);
 		kfree(ahp);
 	}
 	return -EINVAL;
@@ -293,8 +471,7 @@ static void ah_destroy(struct xfrm_state *x)
 	if (!ahp)
 		return;
 
-	kfree(ahp->work_icv);
-	crypto_free_hash(ahp->tfm);
+	crypto_free_ahash(ahp->ahash);
 	kfree(ahp);
 }
 
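The ah4.c rewrite above moves from the synchronous crypto_hash API, serialized under x->lock, to the asynchronous ahash API: all per-packet scratch (the saved IP header, the ICV, the request and the scatterlist) lives in one kmalloc'ed blob parked in skb->cb, and a return of -EINPROGRESS from crypto_ahash_digest() means the digest finishes later in ah_output_done()/ah_input_done(), which resume xfrm processing. A self-contained sketch of that calling convention follows; the demo_* names are illustrative, and the request here comes from the generic ahash_request_alloc() rather than the hand-carved buffer the patch uses:

/* sketch of the async ahash contract ah4 now relies on */
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static void demo_done(struct crypto_async_request *base, int err)
{
	/* Runs later (typically in softirq context) when an async
	 * implementation finishes. A real caller frees its request
	 * and state here, as ah4 does with AH_SKB_CB(skb)->tmp
	 * before calling xfrm_output_resume()/xfrm_input_resume(). */
}

static int demo_digest(const void *buf, unsigned int len, u8 *out)
{
	static const u8 key[20];	/* all-zero demo key */
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_ahash("hmac(sha1)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_ahash_setkey(tfm, key, sizeof(key));
	if (err)
		goto free_tfm;

	err = -ENOMEM;
	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req)
		goto free_tfm;

	sg_init_one(&sg, buf, len);	/* buf must not live on the stack */
	ahash_request_set_callback(req, 0, demo_done, NULL);
	ahash_request_set_crypt(req, &sg, out, len);

	err = crypto_ahash_digest(req);
	if (err == -EINPROGRESS)
		return err;	/* demo_done() fires later; the caller must
				 * keep req/tfm alive until then */

	ahash_request_free(req);
free_tfm:
	crypto_free_ahash(tfm);
	return err;
}

The design point is that synchronous implementations still complete inline (return 0 or an error), so the fast path loses nothing, while hardware offload can return -EINPROGRESS without blocking the packet path on a spinlock.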
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 039cc1ffe977..1e029dc75455 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -2017,7 +2017,7 @@ req_setattr_failure:
  * values on failure.
  *
  */
-int cipso_v4_delopt(struct ip_options **opt_ptr)
+static int cipso_v4_delopt(struct ip_options **opt_ptr)
 {
 	int hdr_delta = 0;
 	struct ip_options *opt = *opt_ptr;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5e6c5a0f3fde..fb2465811b48 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_reset(sk);
 
 	oif = sk->sk_bound_dev_if;
-	saddr = inet->saddr;
+	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
 		if (!oif)
 			oif = inet->mc_index;
@@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
 			       RT_CONN_FLAGS(sk), oif,
 			       sk->sk_protocol,
-			       inet->sport, usin->sin_port, sk, 1);
+			       inet->inet_sport, usin->sin_port, sk, 1);
 	if (err) {
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		ip_rt_put(rt);
 		return -EACCES;
 	}
-	if (!inet->saddr)
-		inet->saddr = rt->rt_src;	/* Update source address */
-	if (!inet->rcv_saddr)
-		inet->rcv_saddr = rt->rt_src;
-	inet->daddr = rt->rt_dst;
-	inet->dport = usin->sin_port;
+	if (!inet->inet_saddr)
+		inet->inet_saddr = rt->rt_src;	/* Update source address */
+	if (!inet->inet_rcv_saddr)
+		inet->inet_rcv_saddr = rt->rt_src;
+	inet->inet_daddr = rt->rt_dst;
+	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
-	inet->id = jiffies;
+	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->u.dst);
 	return(0);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e049da8311b5..5cdbc102a418 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -140,11 +140,11 @@ void in_dev_finish_destroy(struct in_device *idev)
 #endif
 	dev_put(dev);
 	if (!idev->dead)
-		printk("Freeing alive in_device %p\n", idev);
-	else {
+		pr_err("Freeing alive in_device %p\n", idev);
+	else
 		kfree(idev);
-	}
 }
+EXPORT_SYMBOL(in_dev_finish_destroy);
 
 static struct in_device *inetdev_init(struct net_device *dev)
 {
@@ -159,7 +159,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	       sizeof(in_dev->cnf));
 	in_dev->cnf.sysctl = NULL;
 	in_dev->dev = dev;
-	if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+	in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+	if (!in_dev->arp_parms)
 		goto out_kfree;
 	if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
 		dev_disable_lro(dev);
@@ -405,13 +406,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
 	struct in_device *in_dev = NULL;
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifindex);
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		in_dev = in_dev_get(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return in_dev;
 }
+EXPORT_SYMBOL(inetdev_by_index);
 
 /* Called only from RTNL semaphored context. No locks. */
 
@@ -557,7 +560,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
  *	Determine a default network mask, based on the IP address.
  */
 
-static __inline__ int inet_abc_len(__be32 addr)
+static inline int inet_abc_len(__be32 addr)
 {
 	int rc = -1;	/* Something else, probably a multicast. */
 
@@ -646,13 +649,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 
 	ret = -ENODEV;
-	if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL)
+	dev = __dev_get_by_name(net, ifr.ifr_name);
+	if (!dev)
 		goto done;
 
 	if (colon)
 		*colon = ':';
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
 		if (tryaddrmatch) {
 			/* Matthias Andree */
 			/* compare label and address (4.4BSD style) */
@@ -720,7 +725,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 		if (!ifa) {
 			ret = -ENOBUFS;
-			if ((ifa = inet_alloc_ifa()) == NULL)
+			ifa = inet_alloc_ifa();
+			if (!ifa)
 				break;
 			if (colon)
 				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
@@ -822,10 +828,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 	struct ifreq ifr;
 	int done = 0;
 
-	if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+	if (!in_dev)
 		goto out;
 
-	for (; ifa; ifa = ifa->ifa_next) {
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
 		if (!buf) {
 			done += sizeof(ifr);
 			continue;
@@ -875,36 +881,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 		if (!addr)
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
-no_in_dev:
-	rcu_read_unlock();
 
 	if (addr)
-		goto out;
+		goto out_unlock;
+no_in_dev:
 
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is importnat that lo is the first interface
 	   in dev_base list.
 	 */
-	read_lock(&dev_base_lock);
-	rcu_read_lock();
-	for_each_netdev(net, dev) {
-		if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (!in_dev)
 			continue;
 
 		for_primary_ifa(in_dev) {
 			if (ifa->ifa_scope != RT_SCOPE_LINK &&
 			    ifa->ifa_scope <= scope) {
 				addr = ifa->ifa_local;
-				goto out_unlock_both;
+				goto out_unlock;
 			}
 		} endfor_ifa(in_dev);
 	}
-out_unlock_both:
-	read_unlock(&dev_base_lock);
+out_unlock:
 	rcu_read_unlock();
-out:
 	return addr;
 }
+EXPORT_SYMBOL(inet_select_addr);
 
 static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 				 __be32 local, int scope)
@@ -940,7 +943,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
 		}
 	} endfor_ifa(in_dev);
 
-	return same? addr : 0;
+	return same ? addr : 0;
 }
 
 /*
@@ -961,17 +964,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
 		return confirm_addr_indev(in_dev, dst, local, scope);
 
 	net = dev_net(in_dev->dev);
-	read_lock(&dev_base_lock);
 	rcu_read_lock();
-	for_each_netdev(net, dev) {
-		if ((in_dev = __in_dev_get_rcu(dev))) {
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (in_dev) {
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
 			if (addr)
 				break;
 		}
 	}
 	rcu_read_unlock();
-	read_unlock(&dev_base_lock);
 
 	return addr;
 }
@@ -984,14 +986,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&inetaddr_chain, nb);
 }
+EXPORT_SYMBOL(register_inetaddr_notifier);
 
 int unregister_inetaddr_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
 }
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
 
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
 */
 static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 {
@@ -1010,11 +1014,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
 			sprintf(old, ":%d", named);
 			dot = old;
 		}
-		if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+		if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
 			strcat(ifa->ifa_label, dot);
-		} else {
+		else
 			strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
-		}
 skip:
 		rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
 	}
@@ -1061,8 +1064,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		if (!inetdev_valid_mtu(dev->mtu))
 			break;
 		if (dev->flags & IFF_LOOPBACK) {
-			struct in_ifaddr *ifa;
-			if ((ifa = inet_alloc_ifa()) != NULL) {
+			struct in_ifaddr *ifa = inet_alloc_ifa();
+
+			if (ifa) {
 				ifa->ifa_local =
 					ifa->ifa_address = htonl(INADDR_LOOPBACK);
 				ifa->ifa_prefixlen = 8;
@@ -1170,38 +1174,54 @@ nla_put_failure:
 static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	int idx, ip_idx;
+	int h, s_h;
+	int idx, s_idx;
+	int ip_idx, s_ip_idx;
 	struct net_device *dev;
 	struct in_device *in_dev;
 	struct in_ifaddr *ifa;
-	int s_ip_idx, s_idx = cb->args[0];
+	struct hlist_head *head;
+	struct hlist_node *node;
 
-	s_ip_idx = ip_idx = cb->args[1];
-	idx = 0;
-	for_each_netdev(net, dev) {
-		if (idx < s_idx)
-			goto cont;
-		if (idx > s_idx)
-			s_ip_idx = 0;
-		if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
-			goto cont;
-
-		for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
-		     ifa = ifa->ifa_next, ip_idx++) {
-			if (ip_idx < s_ip_idx)
-				continue;
-			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+	s_ip_idx = ip_idx = cb->args[2];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			if (idx > s_idx)
+				s_ip_idx = 0;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+			     ifa = ifa->ifa_next, ip_idx++) {
+				if (ip_idx < s_ip_idx)
+					continue;
+				if (inet_fill_ifaddr(skb, ifa,
+					     NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR, NLM_F_MULTI) <= 0)
-				goto done;
-		}
+					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+					rcu_read_unlock();
+					goto done;
+				}
+			}
 cont:
-		idx++;
+			idx++;
+		}
+		rcu_read_unlock();
 	}
 
 done:
-	cb->args[0] = idx;
-	cb->args[1] = ip_idx;
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	cb->args[2] = ip_idx;
 
 	return skb->len;
 }
@@ -1239,18 +1259,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		struct in_device *in_dev;
-		rcu_read_lock();
+
 		in_dev = __in_dev_get_rcu(dev);
 		if (in_dev && !test_bit(i, in_dev->cnf.state))
 			in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
-		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
+/* called with RTNL locked */
 static void inet_forward_change(struct net *net)
 {
 	struct net_device *dev;
@@ -1259,7 +1279,6 @@ static void inet_forward_change(struct net *net)
 	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
 	IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
 
-	read_lock(&dev_base_lock);
 	for_each_netdev(net, dev) {
 		struct in_device *in_dev;
 		if (on)
@@ -1270,7 +1289,6 @@ static void inet_forward_change(struct net *net)
 			IN_DEV_CONF_SET(in_dev, FORWARDING, on);
 		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1378,6 +1396,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
 					"accept_source_route"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
 		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
 		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
 		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1512,7 +1531,7 @@ static __net_init int devinet_init_net(struct net *net)
 	all = &ipv4_devconf;
 	dflt = &ipv4_devconf_dflt;
 
-	if (net != &init_net) {
+	if (!net_eq(net, &init_net)) {
 		all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
 		if (all == NULL)
 			goto err_alloc_all;
@@ -1603,8 +1622,3 @@ void __init devinet_init(void)
 	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
 }
 
-EXPORT_SYMBOL(in_dev_finish_destroy);
-EXPORT_SYMBOL(inet_select_addr);
-EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(register_inetaddr_notifier);
-EXPORT_SYMBOL(unregister_inetaddr_notifier);
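The devinet.c changes are mostly a locking conversion: readers that used to take read_lock(&dev_base_lock) now walk the device list under RCU (for_each_netdev_rcu(), dev_get_by_index_rcu()), each EXPORT_SYMBOL moves next to its definition, and inet_dump_ifaddr() iterates the per-namespace ifindex hash so its netlink resume state becomes the (hash bucket, device index, address index) triple in cb->args[]. A sketch of the reader pattern, using a hypothetical helper that is not in the patch:

#include <linux/inetdevice.h>
#include <linux/netdevice.h>

/* count devices in @net that have IPv4 state attached, using the RCU
 * walk that replaces the dev_base_lock read path in devinet.c */
static int demo_count_inet_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (__in_dev_get_rcu(dev))	/* device has an in_device */
			count++;
	}
	rcu_read_unlock();	/* no sleeping between lock and unlock */

	return count;
}

The trade is the usual RCU one: readers become lock-free and scale with CPU count, at the cost that anything observed inside the critical section may be concurrently unregistered, so references must be taken (as in_dev_get() does in inetdev_by_index()) before use outside it.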
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 12f7287e902d..1948895beb6d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x)
 		}
 
 		err = crypto_aead_setauthsize(
-			aead, aalg_desc->uinfo.auth.icv_truncbits / 8);
+			aead, x->aalg->alg_trunc_len / 8);
 		if (err)
 			goto free_key;
 	}
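As in ah4.c above, esp4.c now takes the ICV truncation length from the user-supplied x->aalg->alg_trunc_len rather than the fixed icv_truncbits in the algorithm descriptor, which is what allows non-default truncation lengths (for example the 128-bit HMAC-SHA-256 ICV of RFC 4868) to be configured per state. A one-line illustration wrapped in a hypothetical helper, not part of the patch:

#include <linux/crypto.h>

/* the authsize programmed into the AEAD now follows the negotiated
 * truncation length, given in bits by userspace via xfrm */
static int demo_set_icv_len(struct crypto_aead *aead,
			    unsigned int trunc_bits)
{
	return crypto_aead_setauthsize(aead, trunc_bits / 8);
}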
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index aa00398be80e..3323168ee52d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -125,7 +125,7 @@ void fib_select_default(struct net *net,
125#endif 125#endif
126 tb = fib_get_table(net, table); 126 tb = fib_get_table(net, table);
127 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 127 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 tb->tb_select_default(tb, flp, res); 128 fib_table_select_default(tb, flp, res);
129} 129}
130 130
131static void fib_flush(struct net *net) 131static void fib_flush(struct net *net)
@@ -139,7 +139,7 @@ static void fib_flush(struct net *net)
139 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 139 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 head = &net->ipv4.fib_table_hash[h]; 140 head = &net->ipv4.fib_table_hash[h];
141 hlist_for_each_entry(tb, node, head, tb_hlist) 141 hlist_for_each_entry(tb, node, head, tb_hlist)
142 flushed += tb->tb_flush(tb); 142 flushed += fib_table_flush(tb);
143 } 143 }
144 144
145 if (flushed) 145 if (flushed)
@@ -162,7 +162,7 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr)
162#endif 162#endif
163 163
164 local_table = fib_get_table(net, RT_TABLE_LOCAL); 164 local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) 165 if (!local_table || fib_table_lookup(local_table, &fl, &res))
166 return NULL; 166 return NULL;
167 if (res.type != RTN_LOCAL) 167 if (res.type != RTN_LOCAL)
168 goto out; 168 goto out;
@@ -200,7 +200,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
200 local_table = fib_get_table(net, RT_TABLE_LOCAL); 200 local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 if (local_table) { 201 if (local_table) {
202 ret = RTN_UNICAST; 202 ret = RTN_UNICAST;
203 if (!local_table->tb_lookup(local_table, &fl, &res)) { 203 if (!fib_table_lookup(local_table, &fl, &res)) {
204 if (!dev || dev == res.fi->fib_dev) 204 if (!dev || dev == res.fi->fib_dev)
205 ret = res.type; 205 ret = res.type;
206 fib_res_put(&res); 206 fib_res_put(&res);
@@ -241,16 +241,17 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
241 .iif = oif }; 241 .iif = oif };
242 242
243 struct fib_result res; 243 struct fib_result res;
244 int no_addr, rpf; 244 int no_addr, rpf, accept_local;
245 int ret; 245 int ret;
246 struct net *net; 246 struct net *net;
247 247
248 no_addr = rpf = 0; 248 no_addr = rpf = accept_local = 0;
249 rcu_read_lock(); 249 rcu_read_lock();
250 in_dev = __in_dev_get_rcu(dev); 250 in_dev = __in_dev_get_rcu(dev);
251 if (in_dev) { 251 if (in_dev) {
252 no_addr = in_dev->ifa_list == NULL; 252 no_addr = in_dev->ifa_list == NULL;
253 rpf = IN_DEV_RPFILTER(in_dev); 253 rpf = IN_DEV_RPFILTER(in_dev);
254 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
254 } 255 }
255 rcu_read_unlock(); 256 rcu_read_unlock();
256 257
@@ -260,8 +261,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
260 net = dev_net(dev); 261 net = dev_net(dev);
261 if (fib_lookup(net, &fl, &res)) 262 if (fib_lookup(net, &fl, &res))
262 goto last_resort; 263 goto last_resort;
263 if (res.type != RTN_UNICAST) 264 if (res.type != RTN_UNICAST) {
264 goto e_inval_res; 265 if (res.type != RTN_LOCAL || !accept_local)
266 goto e_inval_res;
267 }
265 *spec_dst = FIB_RES_PREFSRC(res); 268 *spec_dst = FIB_RES_PREFSRC(res);
266 fib_combine_itag(itag, &res); 269 fib_combine_itag(itag, &res);
267#ifdef CONFIG_IP_ROUTE_MULTIPATH 270#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -476,13 +479,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
476 if (cmd == SIOCDELRT) { 479 if (cmd == SIOCDELRT) {
477 tb = fib_get_table(net, cfg.fc_table); 480 tb = fib_get_table(net, cfg.fc_table);
478 if (tb) 481 if (tb)
479 err = tb->tb_delete(tb, &cfg); 482 err = fib_table_delete(tb, &cfg);
480 else 483 else
481 err = -ESRCH; 484 err = -ESRCH;
482 } else { 485 } else {
483 tb = fib_new_table(net, cfg.fc_table); 486 tb = fib_new_table(net, cfg.fc_table);
484 if (tb) 487 if (tb)
485 err = tb->tb_insert(tb, &cfg); 488 err = fib_table_insert(tb, &cfg);
486 else 489 else
487 err = -ENOBUFS; 490 err = -ENOBUFS;
488 } 491 }
@@ -597,7 +600,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
597 goto errout; 600 goto errout;
598 } 601 }
599 602
-	err = tb->tb_delete(tb, &cfg);
+	err = fib_table_delete(tb, &cfg);
 errout:
 	return err;
 }
@@ -619,7 +622,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
 		goto errout;
 	}
 
-	err = tb->tb_insert(tb, &cfg);
+	err = fib_table_insert(tb, &cfg);
 errout:
 	return err;
 }
@@ -650,7 +653,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		if (dumped)
 			memset(&cb->args[2], 0, sizeof(cb->args) -
				 2 * sizeof(cb->args[0]));
-		if (tb->tb_dump(tb, skb, cb) < 0)
+		if (fib_table_dump(tb, skb, cb) < 0)
 			goto out;
 		dumped = 1;
 next:
@@ -704,9 +707,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 	cfg.fc_scope = RT_SCOPE_HOST;
 
 	if (cmd == RTM_NEWROUTE)
-		tb->tb_insert(tb, &cfg);
+		fib_table_insert(tb, &cfg);
 	else
-		tb->tb_delete(tb, &cfg);
+		fib_table_delete(tb, &cfg);
 }
 
 void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -835,7 +838,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
 	local_bh_disable();
 
 	frn->tb_id = tb->tb_id;
-	frn->err = tb->tb_lookup(tb, &fl, &res);
+	frn->err = fib_table_lookup(tb, &fl, &res);
 
 	if (!frn->err) {
 		frn->prefixlen = res.prefixlen;
@@ -895,11 +898,11 @@ static void nl_fib_lookup_exit(struct net *net)
 	net->ipv4.fibnl = NULL;
 }
 
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
 {
 	if (fib_sync_down_dev(dev, force))
 		fib_flush(dev_net(dev));
-	rt_cache_flush(dev_net(dev), 0);
+	rt_cache_flush(dev_net(dev), delay);
 	arp_ifdown(dev);
 }
 
@@ -922,7 +925,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 			/* Last address was deleted from this interface.
			   Disable IP.
			 */
-			fib_disable_ip(dev, 1);
+			fib_disable_ip(dev, 1, 0);
 		} else {
 			rt_cache_flush(dev_net(dev), -1);
 		}
@@ -937,7 +940,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
 	if (event == NETDEV_UNREGISTER) {
-		fib_disable_ip(dev, 2);
+		fib_disable_ip(dev, 2, -1);
 		return NOTIFY_DONE;
 	}
 
@@ -955,12 +958,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
-		fib_disable_ip(dev, 0);
+		fib_disable_ip(dev, 0, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
 		rt_cache_flush(dev_net(dev), 0);
 		break;
+	case NETDEV_UNREGISTER_BATCH:
+		rt_cache_flush_batch();
+		break;
 	}
 	return NOTIFY_DONE;
 }
@@ -1012,7 +1018,7 @@ static void __net_exit ip_fib_net_exit(struct net *net)
 		head = &net->ipv4.fib_table_hash[i];
 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
 			hlist_del(node);
-			tb->tb_flush(tb);
+			fib_table_flush(tb);
 			kfree(tb);
 		}
 	}
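The fib_frontend.c hunks above are all instances of one mechanical conversion: every indirect call through the per-table function pointers becomes a direct call into the compiled-in table implementation. A minimal sketch of the shape of the change (illustration only, not part of the patch):

	/* before: one indirect branch per route operation */
	err = tb->tb_insert(tb, &cfg);

	/* after: an ordinary call, resolved at link time */
	err = fib_table_insert(tb, &cfg);

Since a given kernel is built with exactly one IPv4 FIB backend (hash or trie), the function pointers bought no run-time flexibility; dropping them removes an indirect branch per operation and shrinks struct fib_table.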
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd39454235c..14972017b9c2 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -242,8 +242,8 @@ fn_new_zone(struct fn_hash *table, int z)
 	return fz;
 }
 
-static int
-fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+int fib_table_lookup(struct fib_table *tb,
+		     const struct flowi *flp, struct fib_result *res)
 {
 	int err;
 	struct fn_zone *fz;
@@ -274,8 +274,8 @@ out:
 	return err;
 }
 
-static void
-fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+void fib_table_select_default(struct fib_table *tb,
+			      const struct flowi *flp, struct fib_result *res)
 {
 	int order, last_idx;
 	struct hlist_node *node;
@@ -366,7 +366,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
-static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fib_node *new_f = NULL;
@@ -544,8 +544,7 @@ out:
 	return err;
 }
 
-
-static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 	struct fib_node *f;
@@ -662,7 +661,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 	return found;
 }
 
-static int fn_hash_flush(struct fib_table *tb)
+int fib_table_flush(struct fib_table *tb)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fn_zone *fz;
@@ -743,7 +742,8 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 	return skb->len;
 }
 
-static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+		   struct netlink_callback *cb)
 {
 	int m, s_m;
 	struct fn_zone *fz;
@@ -787,12 +787,7 @@ struct fib_table *fib_hash_table(u32 id)
 
 	tb->tb_id = id;
 	tb->tb_default = -1;
-	tb->tb_lookup = fn_hash_lookup;
-	tb->tb_insert = fn_hash_insert;
-	tb->tb_delete = fn_hash_delete;
-	tb->tb_flush = fn_hash_flush;
-	tb->tb_select_default = fn_hash_select_default;
-	tb->tb_dump = fn_hash_dump;
+
 	memset(tb->tb_data, 0, sizeof(struct fn_hash));
 	return tb;
 }
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 92d9d97ec5e3..ca2d07b1c706 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -94,7 +94,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 	if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
 		goto errout;
 
-	err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
+	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
 	if (err > 0)
 		err = -EAGAIN;
 errout:
@@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
 {
 	int err;
 
-	err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
+	err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
 	if (err < 0)
 		return err;
 	err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
@@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net)
 	int err;
 	struct fib_rules_ops *ops;
 
-	ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
-	if (ops == NULL)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&ops->rules_list);
-	ops->fro_net = net;
-
-	fib_rules_register(ops);
+	ops = fib_rules_register(&fib4_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
 
 	err = fib_default_rules_init(ops);
 	if (err < 0)
@@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net)
 fail:
 	/* also cleans all rules already added */
 	fib_rules_unregister(ops);
-	kfree(ops);
 	return err;
 }
 
 void __net_exit fib4_rules_exit(struct net *net)
 {
 	fib_rules_unregister(net->ipv4.rules_ops);
-	kfree(net->ipv4.rules_ops);
 }
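fib4_rules_init() now hands a template to fib_rules_register(), which appears to duplicate it and take ownership of the resulting ops, reporting failure through the kernel's encoded-pointer convention instead of a NULL return. A sketch of that idiom, assuming fib_rules_register() returns either a valid pointer or an errno packed into the pointer value:

	struct fib_rules_ops *ops;

	ops = fib_rules_register(&fib4_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);	/* e.g. -ENOMEM, recovered from the pointer */

With allocation and freeing moved inside register/unregister, the explicit kmemdup()/kfree() pairs (and the error paths that had to remember them) disappear from the caller.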
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b096d6ff3f2..ed19aa6919c2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -228,7 +228,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 	head = &fib_info_hash[hash];
 
 	hlist_for_each_entry(fi, node, head, fib_hash) {
-		if (fi->fib_net != nfi->fib_net)
+		if (!net_eq(fi->fib_net, nfi->fib_net))
 			continue;
 		if (fi->fib_nhs != nfi->fib_nhs)
 			continue;
@@ -1047,7 +1047,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
 		return 0;
 
 	hlist_for_each_entry(fi, node, head, fib_lhash) {
-		if (fi->fib_net != net)
+		if (!net_eq(fi->fib_net, net))
 			continue;
 		if (fi->fib_prefsrc == local) {
 			fi->fib_flags |= RTNH_F_DEAD;
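These two hunks replace raw namespace pointer comparisons with net_eq(), a pattern repeated in several files below. The helper only performs the comparison when namespace support is compiled in; a sketch of its assumed shape (the real definition lives in include/net/net_namespace.h):

	#ifdef CONFIG_NET_NS
	static inline int net_eq(const struct net *net1, const struct net *net2)
	{
		return net1 == net2;
	}
	#else
	static inline int net_eq(const struct net *net1, const struct net *net2)
	{
		return 1;	/* single namespace: always equal */
	}
	#endif

On !CONFIG_NET_NS kernels the test folds to a constant, so the converted comparisons compile away entirely.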
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 291bdf50a21f..af5d89792860 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1174,7 +1174,7 @@ done:
 /*
  * Caller must hold RTNL.
  */
-static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct fib_alias *fa, *new_fa;
@@ -1373,8 +1373,8 @@ static int check_leaf(struct trie *t, struct leaf *l,
 	return 1;
 }
 
-static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
-			  struct fib_result *res)
+int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
+		     struct fib_result *res)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int ret;
@@ -1595,7 +1595,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
 /*
  * Caller must hold RTNL.
  */
-static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	u32 key, mask;
@@ -1786,7 +1786,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
 /*
  * Caller must hold RTNL.
  */
-static int fn_trie_flush(struct fib_table *tb)
+int fib_table_flush(struct fib_table *tb)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct leaf *l, *ll = NULL;
@@ -1807,9 +1807,9 @@ static int fn_trie_flush(struct fib_table *tb)
 	return found;
 }
 
-static void fn_trie_select_default(struct fib_table *tb,
-				   const struct flowi *flp,
-				   struct fib_result *res)
+void fib_table_select_default(struct fib_table *tb,
+			      const struct flowi *flp,
+			      struct fib_result *res)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int order, last_idx;
@@ -1952,8 +1952,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
 	return skb->len;
 }
 
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
-			struct netlink_callback *cb)
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+		   struct netlink_callback *cb)
 {
 	struct leaf *l;
 	struct trie *t = (struct trie *) tb->tb_data;
@@ -2020,12 +2020,6 @@ struct fib_table *fib_hash_table(u32 id)
 
 	tb->tb_id = id;
 	tb->tb_default = -1;
-	tb->tb_lookup = fn_trie_lookup;
-	tb->tb_insert = fn_trie_insert;
-	tb->tb_delete = fn_trie_delete;
-	tb->tb_flush = fn_trie_flush;
-	tb->tb_select_default = fn_trie_select_default;
-	tb->tb_dump = fn_trie_dump;
 
 	t = (struct trie *) tb->tb_data;
 	memset(t, 0, sizeof(*t));
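fib_trie.c gets the same treatment as fib_hash.c: the static fn_trie_* helpers become the globally visible fib_table_*() entry points and the pointer assignments in fib_hash_table() go away. Both backends can define identical external symbols because the build selects exactly one of them. A sketch of the shared prototypes the callers in fib_frontend.c and fib_rules.c are assumed to see (placement in include/net/ip_fib.h is an assumption; it is not shown in this diff):

	int fib_table_insert(struct fib_table *, struct fib_config *);
	int fib_table_delete(struct fib_table *, struct fib_config *);
	int fib_table_lookup(struct fib_table *,
			     const struct flowi *, struct fib_result *);
	int fib_table_flush(struct fib_table *);
	int fib_table_dump(struct fib_table *, struct sk_buff *,
			   struct netlink_callback *);
	void fib_table_select_default(struct fib_table *,
				      const struct flowi *,
				      struct fib_result *);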
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5bc13fe816d1..fe11f60ce41b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -501,15 +501,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	if (!(rt->rt_flags & RTCF_LOCAL)) {
 		struct net_device *dev = NULL;
 
+		rcu_read_lock();
 		if (rt->fl.iif &&
		    net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
-			dev = dev_get_by_index(net, rt->fl.iif);
+			dev = dev_get_by_index_rcu(net, rt->fl.iif);
 
-		if (dev) {
+		if (dev)
 			saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-			dev_put(dev);
-		} else
+		else
 			saddr = 0;
+		rcu_read_unlock();
 	}
 
 	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
@@ -1165,6 +1166,10 @@ static int __net_init icmp_sk_init(struct net *net)
 		sk->sk_sndbuf =
			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
 
+		/*
+		 * Speedup sock_wfree()
+		 */
+		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
 	}
 
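The icmp_send() hunk swaps a refcount-taking device lookup for an RCU-protected one, trading an atomic get/put per ICMP error for a short read-side critical section. The general pattern, as a standalone sketch (the ifindex variable and the use of the device are illustrative):

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);	/* no refcount taken */
	if (dev)
		saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else
		saddr = 0;
	rcu_read_unlock();	/* dev must not be dereferenced past this point */

The constraint is that dev is only valid inside the rcu_read_lock()/rcu_read_unlock() window, which is why the old dev_put() disappears.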
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d41e5de79a82..76c08402c933 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1899,8 +1899,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	err = -EADDRNOTAVAIL;
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
-		if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
-		    && pmc->multi.imr_ifindex == imr.imr_ifindex)
+		if ((pmc->multi.imr_multiaddr.s_addr ==
+		     imr.imr_multiaddr.s_addr) &&
+		    (pmc->multi.imr_ifindex == imr.imr_ifindex))
 			break;
 	}
 	if (!pmc) {		/* must have a prior join */
@@ -2311,9 +2312,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 
 	state->in_dev = NULL;
-	for_each_netdev(net, state->dev) {
+	for_each_netdev_rcu(net, state->dev) {
 		struct in_device *in_dev;
-		in_dev = in_dev_get(state->dev);
+
+		in_dev = __in_dev_get_rcu(state->dev);
 		if (!in_dev)
 			continue;
 		read_lock(&in_dev->mc_list_lock);
@@ -2323,7 +2325,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
 			break;
 		}
 		read_unlock(&in_dev->mc_list_lock);
-		in_dev_put(in_dev);
 	}
 	return im;
 }
@@ -2333,16 +2334,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	im = im->next;
 	while (!im) {
-		if (likely(state->in_dev != NULL)) {
+		if (likely(state->in_dev != NULL))
 			read_unlock(&state->in_dev->mc_list_lock);
-			in_dev_put(state->in_dev);
-		}
-		state->dev = next_net_device(state->dev);
+
+		state->dev = next_net_device_rcu(state->dev);
 		if (!state->dev) {
 			state->in_dev = NULL;
 			break;
 		}
-		state->in_dev = in_dev_get(state->dev);
+		state->in_dev = __in_dev_get_rcu(state->dev);
 		if (!state->in_dev)
 			continue;
 		read_lock(&state->in_dev->mc_list_lock);
@@ -2361,9 +2361,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(rcu)
 {
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -2379,16 +2379,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(rcu)
 {
 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
 	if (likely(state->in_dev != NULL)) {
 		read_unlock(&state->in_dev->mc_list_lock);
-		in_dev_put(state->in_dev);
 		state->in_dev = NULL;
 	}
 	state->dev = NULL;
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int igmp_mc_seq_show(struct seq_file *seq, void *v)
@@ -2462,9 +2461,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
 
 	state->idev = NULL;
 	state->im = NULL;
-	for_each_netdev(net, state->dev) {
+	for_each_netdev_rcu(net, state->dev) {
 		struct in_device *idev;
-		idev = in_dev_get(state->dev);
+		idev = __in_dev_get_rcu(state->dev);
 		if (unlikely(idev == NULL))
 			continue;
 		read_lock(&idev->mc_list_lock);
@@ -2480,7 +2479,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
 			spin_unlock_bh(&im->lock);
 		}
 		read_unlock(&idev->mc_list_lock);
-		in_dev_put(idev);
 	}
 	return psf;
 }
@@ -2494,16 +2492,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
 		spin_unlock_bh(&state->im->lock);
 	state->im = state->im->next;
 	while (!state->im) {
-		if (likely(state->idev != NULL)) {
+		if (likely(state->idev != NULL))
 			read_unlock(&state->idev->mc_list_lock);
-			in_dev_put(state->idev);
-		}
-		state->dev = next_net_device(state->dev);
+
+		state->dev = next_net_device_rcu(state->dev);
 		if (!state->dev) {
 			state->idev = NULL;
 			goto out;
 		}
-		state->idev = in_dev_get(state->dev);
+		state->idev = __in_dev_get_rcu(state->dev);
 		if (!state->idev)
 			continue;
 		read_lock(&state->idev->mc_list_lock);
@@ -2528,8 +2525,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
 {
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -2545,6 +2543,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
 {
 	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
 	if (likely(state->im != NULL)) {
@@ -2553,11 +2552,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
 	}
 	if (likely(state->idev != NULL)) {
 		read_unlock(&state->idev->mc_list_lock);
-		in_dev_put(state->idev);
 		state->idev = NULL;
 	}
 	state->dev = NULL;
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
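All four igmp seq_file iterators move from dev_base_lock plus in_dev_get()/in_dev_put() to RCU: start() takes rcu_read_lock(), the walk uses for_each_netdev_rcu()/next_net_device_rcu() and __in_dev_get_rcu(), and stop() drops the read lock. A condensed sketch of the iteration pattern used above (the body comment marks where per-device work would go):

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (!in_dev)
			continue;
		/* inspect in_dev; no in_dev_put() is needed under RCU */
	}
	rcu_read_unlock();

The __acquires(rcu)/__releases(rcu) annotations keep sparse's lock-context checking in step with the new scheme.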
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 537731b3bcb3..ee16475f8fc3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -112,7 +112,7 @@ again:
				hashinfo->bhash_size)];
 		spin_lock(&head->lock);
 		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (ib_net(tb) == net && tb->port == rover) {
+			if (net_eq(ib_net(tb), net) && tb->port == rover) {
 				if (tb->fastreuse > 0 &&
				    sk->sk_reuse &&
				    sk->sk_state != TCP_LISTEN &&
@@ -158,7 +158,7 @@ have_snum:
				hashinfo->bhash_size)];
 		spin_lock(&head->lock);
 		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (ib_net(tb) == net && tb->port == snum)
+			if (net_eq(ib_net(tb), net) && tb->port == snum)
 				goto tb_found;
 	}
 	tb = NULL;
@@ -358,6 +358,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct ip_options *opt = inet_rsk(req)->opt;
 	struct flowi fl = { .oif = sk->sk_bound_dev_if,
+			    .mark = sk->sk_mark,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
@@ -367,7 +368,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
			    .proto = sk->sk_protocol,
			    .flags = inet_sk_flowi_flags(sk),
			    .uli_u = { .ports =
-				       { .sport = inet_sk(sk)->sport,
+				       { .sport = inet_sk(sk)->inet_sport,
					 .dport = ireq->rmt_port } } };
 	struct net *net = sock_net(sk);
 
@@ -530,7 +531,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
				       &expire, &resend);
 		if (!expire &&
		    (!resend ||
-		     !req->rsk_ops->rtx_syn_ack(parent, req) ||
+		     !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
		     inet_rsk(req)->acked)) {
 			unsigned long timeo;
 
@@ -574,9 +575,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
-		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
-		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
-		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
+		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
@@ -607,8 +608,8 @@ void inet_csk_destroy_sock(struct sock *sk)
 	/* It cannot be in hash table! */
 	WARN_ON(!sk_unhashed(sk));
 
-	/* If it has not 0 inet_sk(sk)->num, it must be bound */
-	WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash);
+	/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
 
 	sk->sk_prot->destroy(sk);
 
@@ -643,8 +644,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
	 * after validation is complete.
	 */
 	sk->sk_state = TCP_LISTEN;
-	if (!sk->sk_prot->get_port(sk, inet->num)) {
-		inet->sport = htons(inet->num);
+	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+		inet->inet_sport = htons(inet->inet_num);
 
 		sk_dst_reset(sk);
 		sk->sk_prot->hash(sk);
@@ -720,8 +721,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 	const struct inet_sock *inet = inet_sk(sk);
 
 	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = inet->daddr;
-	sin->sin_port = inet->dport;
+	sin->sin_addr.s_addr = inet->inet_daddr;
+	sin->sin_port = inet->inet_dport;
 }
 
 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a706a47f4dbb..bdb78dd180ce 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -116,10 +116,10 @@ static int inet_csk_diag_fill(struct sock *sk,
 	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
 	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
 
-	r->id.idiag_sport = inet->sport;
-	r->id.idiag_dport = inet->dport;
-	r->id.idiag_src[0] = inet->rcv_saddr;
-	r->id.idiag_dst[0] = inet->daddr;
+	r->id.idiag_sport = inet->inet_sport;
+	r->id.idiag_dport = inet->inet_dport;
+	r->id.idiag_src[0] = inet->inet_rcv_saddr;
+	r->id.idiag_dst[0] = inet->inet_daddr;
 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
@@ -504,11 +504,11 @@ static int inet_csk_diag_dump(struct sock *sk,
 	} else
 #endif
 	{
-		entry.saddr = &inet->rcv_saddr;
-		entry.daddr = &inet->daddr;
+		entry.saddr = &inet->inet_rcv_saddr;
+		entry.daddr = &inet->inet_daddr;
 	}
-	entry.sport = inet->num;
-	entry.dport = ntohs(inet->dport);
+	entry.sport = inet->inet_num;
+	entry.dport = ntohs(inet->inet_dport);
 	entry.userlocks = sk->sk_userlocks;
 
 	if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
@@ -584,7 +584,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	if (tmo < 0)
 		tmo = 0;
 
-	r->id.idiag_sport = inet->sport;
+	r->id.idiag_sport = inet->inet_sport;
 	r->id.idiag_dport = ireq->rmt_port;
 	r->id.idiag_src[0] = ireq->loc_addr;
 	r->id.idiag_dst[0] = ireq->rmt_addr;
@@ -639,7 +639,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
 		bc = (struct rtattr *)(r + 1);
-		entry.sport = inet->num;
+		entry.sport = inet->inet_num;
 		entry.userlocks = sk->sk_userlocks;
 	}
 
@@ -732,7 +732,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
				continue;
			}
 
-			if (r->id.idiag_sport != inet->sport &&
+			if (r->id.idiag_sport != inet->inet_sport &&
			    r->id.idiag_sport)
				goto next_listen;
 
@@ -774,7 +774,7 @@ skip_listen_ht:
 	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
 		goto unlock;
 
-	for (i = s_i; i < hashinfo->ehash_size; i++) {
+	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
@@ -797,10 +797,10 @@ skip_listen_ht:
				goto next_normal;
			if (!(r->idiag_states & (1 << sk->sk_state)))
				goto next_normal;
-			if (r->id.idiag_sport != inet->sport &&
+			if (r->id.idiag_sport != inet->inet_sport &&
			    r->id.idiag_sport)
				goto next_normal;
-			if (r->id.idiag_dport != inet->dport &&
+			if (r->id.idiag_dport != inet->inet_dport &&
			    r->id.idiag_dport)
				goto next_normal;
			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 625cc5f64c94..21e5e32d8c60 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 
 	atomic_inc(&hashinfo->bsockets);
 
-	inet_sk(sk)->num = snum;
+	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
 	tb->num_owners++;
 	inet_csk(sk)->icsk_bind_hash = tb;
@@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 static void __inet_put_port(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num,
+	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk)
 	__sk_del_bind_node(sk);
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
-	inet_sk(sk)->num = 0;
+	inet_sk(sk)->inet_num = 0;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
 	spin_unlock(&head->lock);
 }
@@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port);
 void __inet_inherit_port(struct sock *sk, struct sock *child)
 {
 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num,
+	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 	int score = -1;
 	struct inet_sock *inet = inet_sk(sk);
 
-	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
			!ipv6_only_sock(sk)) {
-		__be32 rcv_saddr = inet->rcv_saddr;
+		__be32 rcv_saddr = inet->inet_rcv_saddr;
 		score = sk->sk_family == PF_INET ? 1 : 0;
 		if (rcv_saddr) {
 			if (rcv_saddr != daddr)
@@ -209,7 +209,7 @@ struct sock * __inet_lookup_established(struct net *net,
	 * have wildcards anyways.
	 */
 	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-	unsigned int slot = hash & (hashinfo->ehash_size - 1);
+	unsigned int slot = hash & hashinfo->ehash_mask;
 	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
 	rcu_read_lock();
@@ -273,18 +273,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
-	__be32 daddr = inet->rcv_saddr;
-	__be32 saddr = inet->daddr;
+	__be32 daddr = inet->inet_rcv_saddr;
+	__be32 saddr = inet->inet_daddr;
 	int dif = sk->sk_bound_dev_if;
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
-	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
+	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw;
+	int twrefcnt = 0;
 
 	spin_lock(lock);
 
@@ -312,25 +314,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
+	if (tw) {
+		twrefcnt = inet_twsk_unhash(tw);
+		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+	}
 	spin_unlock(lock);
+	if (twrefcnt)
+		inet_twsk_put(tw);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
 	if (twp) {
 		*twp = tw;
-		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
 		inet_twsk_deschedule(tw, death_row);
-		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
 
 		inet_twsk_put(tw);
 	}
-
 	return 0;
 
 not_unique:
@@ -341,8 +346,9 @@ not_unique:
 static inline u32 inet_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
-					  inet->dport);
+	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
 }
 
 void __inet_hash_nolisten(struct sock *sk)
@@ -424,7 +430,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		void (*hash)(struct sock *sk))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
-	const unsigned short snum = inet_sk(sk)->num;
+	const unsigned short snum = inet_sk(sk)->inet_num;
 	struct inet_bind_hashbucket *head;
 	struct inet_bind_bucket *tb;
 	int ret;
@@ -452,7 +458,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		 * unique enough.
		 */
 		inet_bind_bucket_for_each(tb, node, &head->chain) {
-			if (ib_net(tb) == net && tb->port == port) {
+			if (net_eq(ib_net(tb), net) &&
+			    tb->port == port) {
				if (tb->fastreuse >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
@@ -485,7 +492,7 @@ ok:
	/* Head lock still held and bh's disabled */
 	inet_bind_hash(sk, tb, port);
 	if (sk_unhashed(sk)) {
-		inet_sk(sk)->sport = htons(port);
+		inet_sk(sk)->inet_sport = htons(port);
 		hash(sk);
 	}
 	spin_unlock(&head->lock);
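inet_hashtables.c also completes the ehash_size to ehash_mask conversion: since the established hash is always a power of two, storing size - 1 lets every hash-to-slot computation drop a subtraction. The two idioms side by side, as a sketch:

	/* before: size stored, mask computed on every lookup */
	slot = hash & (hashinfo->ehash_size - 1);

	/* after: mask stored directly; loops become i <= ehash_mask */
	slot = hash & hashinfo->ehash_mask;

The same rename explains the inet_diag.c loop bound changing from i < ehash_size to i <= ehash_mask in the hunks above.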
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 6a667dae315e..47038cb6c138 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
 	if (iph->ihl != IPH_LEN_WO_OPTIONS)
 		return -1;
 
-	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
-	    || tcph->rst || tcph->syn || tcph->fin)
+	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+	    tcph->rst || tcph->syn || tcph->fin)
 		return -1;
 
 	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
 		return -1;
 
-	if (tcph->doff != TCPH_LEN_WO_OPTIONS
-	    && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+	if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+	    tcph->doff != TCPH_LEN_W_TIMESTAMP)
 		return -1;
 
	/* check tcp options (only timestamp allowed) */
@@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
			      struct iphdr *iph,
			      struct tcphdr *tcph)
 {
-	if ((lro_desc->iph->saddr != iph->saddr)
-	    || (lro_desc->iph->daddr != iph->daddr)
-	    || (lro_desc->tcph->source != tcph->source)
-	    || (lro_desc->tcph->dest != tcph->dest))
+	if ((lro_desc->iph->saddr != iph->saddr) ||
+	    (lro_desc->iph->daddr != iph->daddr) ||
+	    (lro_desc->tcph->source != tcph->source) ||
+	    (lro_desc->tcph->dest != tcph->dest))
 		return -1;
 	return 0;
 }
@@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
 	u64 flags;
 	int vlan_hdr_len = 0;
 
-	if (!lro_mgr->get_skb_header
-	    || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
-				       &flags, priv))
+	if (!lro_mgr->get_skb_header ||
+	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
				    &flags, priv))
 		goto out;
 
 	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
 	if (!lro_desc)
 		goto out;
 
-	if ((skb->protocol == htons(ETH_P_8021Q))
-	    && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+	if ((skb->protocol == htons(ETH_P_8021Q)) &&
+	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
 		vlan_hdr_len = VLAN_HLEN;
 
 	if (!lro_desc->active) { /* start new lro session */
@@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
 	int hdr_len = LRO_MAX_PG_HLEN;
 	int vlan_hdr_len = 0;
 
-	if (!lro_mgr->get_frag_header
-	    || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
-					(void *)&tcph, &flags, priv)) {
+	if (!lro_mgr->get_frag_header ||
+	    lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
				     (void *)&tcph, &flags, priv)) {
 		mac_hdr = page_address(frags->page) + frags->page_offset;
 		goto out1;
 	}
@@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
 	if (!skb)
 		goto out;
 
-	if ((skb->protocol == htons(ETH_P_8021Q))
-	    && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+	if ((skb->protocol == htons(ETH_P_8021Q)) &&
+	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
 		vlan_hdr_len = VLAN_HLEN;
 
 	iph = (void *)(skb->data + vlan_hdr_len);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781f35cd..0fdf45e4c90c 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,22 +14,33 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
+
+/*
+ * unhash a timewait socket from established hash
+ * lock must be hold by caller
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+	if (hlist_nulls_unhashed(&tw->tw_node))
+		return 0;
+
+	hlist_nulls_del_rcu(&tw->tw_node);
+	sk_nulls_node_init(&tw->tw_node);
+	return 1;
+}
+
 /* Must be called with locally disabled BHs. */
 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
			     struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
+	int refcnt;
	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
 	spin_lock(lock);
-	if (hlist_nulls_unhashed(&tw->tw_node)) {
-		spin_unlock(lock);
-		return;
-	}
-	hlist_nulls_del_rcu(&tw->tw_node);
-	sk_nulls_node_init(&tw->tw_node);
+	refcnt = inet_twsk_unhash(tw);
 	spin_unlock(lock);
 
	/* Disassociate with bind bucket. */
@@ -37,9 +48,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
			hashinfo->bhash_size)];
 	spin_lock(&bhead->lock);
 	tb = tw->tw_tb;
-	__hlist_del(&tw->tw_bind_node);
-	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	if (tb) {
+		__hlist_del(&tw->tw_bind_node);
+		tw->tw_tb = NULL;
+		inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+		refcnt++;
+	}
 	spin_unlock(&bhead->lock);
 #ifdef SOCK_REFCNT_DEBUG
 	if (atomic_read(&tw->tw_refcnt) != 1) {
@@ -47,7 +61,10 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
 	}
 #endif
-	inet_twsk_put(tw);
+	while (refcnt) {
+		inet_twsk_put(tw);
+		refcnt--;
+	}
 }
 
 static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -86,7 +103,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
	   Note, that any socket with inet->num != 0 MUST be bound in
	   binding cache, even if it is closed.
	 */
-	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num,
+	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
			hashinfo->bhash_size)];
 	spin_lock(&bhead->lock);
 	tw->tw_tb = icsk->icsk_bind_hash;
@@ -101,13 +118,22 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
	 * Should be done before removing sk from established chain
	 * because readers are lockless and search established first.
	 */
-	atomic_inc(&tw->tw_refcnt);
 	inet_twsk_add_node_rcu(tw, &ehead->twchain);
 
	/* Step 3: Remove SK from established hash. */
 	if (__sk_nulls_del_node_init_rcu(sk))
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
+	/*
+	 * Notes :
+	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
+	 * - We add one reference for the bhash link
+	 * - We add one reference for the ehash link
+	 * - We want this refcnt update done before allowing other
+	 *   threads to find this tw in ehash chain.
+	 */
+	atomic_add(1 + 1 + 1, &tw->tw_refcnt);
+
 	spin_unlock(lock);
 }
 
@@ -124,14 +150,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		kmemcheck_annotate_bitfield(tw, flags);
 
		/* Give us an identity. */
-		tw->tw_daddr = inet->daddr;
-		tw->tw_rcv_saddr = inet->rcv_saddr;
+		tw->tw_daddr = inet->inet_daddr;
+		tw->tw_rcv_saddr = inet->inet_rcv_saddr;
 		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
-		tw->tw_num = inet->num;
+		tw->tw_num = inet->inet_num;
 		tw->tw_state = TCP_TIME_WAIT;
 		tw->tw_substate = state;
-		tw->tw_sport = inet->sport;
-		tw->tw_dport = inet->dport;
+		tw->tw_sport = inet->inet_sport;
+		tw->tw_dport = inet->inet_dport;
 		tw->tw_family = sk->sk_family;
 		tw->tw_reuse = sk->sk_reuse;
 		tw->tw_hash = sk->sk_hash;
@@ -139,7 +165,12 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_transparent = inet->transparent;
 		tw->tw_prot = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
-		atomic_set(&tw->tw_refcnt, 1);
+		/*
+		 * Because we use RCU lookups, we should not set tw_refcnt
+		 * to a non null value before everything is setup for this
+		 * timewait socket.
+		 */
+		atomic_set(&tw->tw_refcnt, 0);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);
 	}
@@ -421,37 +452,46 @@ out:
 
 EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
 
-void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
		     struct inet_timewait_death_row *twdr, int family)
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
 	struct hlist_nulls_node *node;
-	int h;
+	unsigned int slot;
 
-	local_bh_disable();
-	for (h = 0; h < (hashinfo->ehash_size); h++) {
-		struct inet_ehash_bucket *head =
-			inet_ehash_bucket(hashinfo, h);
-		spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
+	for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+		rcu_read_lock();
 restart:
-		spin_lock(lock);
-		sk_nulls_for_each(sk, node, &head->twchain) {
-
+		sk_nulls_for_each_rcu(sk, node, &head->twchain) {
 			tw = inet_twsk(sk);
-			if (!net_eq(twsk_net(tw), net) ||
-			    tw->tw_family != family)
+			if ((tw->tw_family != family) ||
				atomic_read(&twsk_net(tw)->count))
+				continue;
+
+			if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
 				continue;
 
-			atomic_inc(&tw->tw_refcnt);
-			spin_unlock(lock);
+			if (unlikely((tw->tw_family != family) ||
				     atomic_read(&twsk_net(tw)->count))) {
+				inet_twsk_put(tw);
+				goto restart;
+			}
+
+			rcu_read_unlock();
 			inet_twsk_deschedule(tw, twdr);
 			inet_twsk_put(tw);
-
-			goto restart;
+			goto restart_rcu;
 		}
-		spin_unlock(lock);
+		/* If the nulls value we got at the end of this lookup is
+		 * not the expected one, we must restart lookup.
+		 * We probably met an item that was moved to another chain.
+		 */
+		if (get_nulls_value(node) != slot)
+			goto restart;
+		rcu_read_unlock();
 	}
-	local_bh_enable();
 }
 EXPORT_SYMBOL_GPL(inet_twsk_purge);
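The timewait rework above has two halves. Allocation now publishes the socket with tw_refcnt at 0 and only raises it (ehash, bhash, and scheduler references in one atomic_add) once the socket is fully initialised, so lockless readers can never act on a half-built entry. Symmetrically, inet_twsk_purge() walks the chains under RCU and must validate each candidate, since nothing pins it during the walk. The lookup side of that scheme, sketched with illustrative control flow:

	rcu_read_lock();
	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
		tw = inet_twsk(sk);
		if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
			continue;	/* refcnt already 0: being freed, skip */
		/* the slot may have been recycled: recheck the identity
		 * fields, and inet_twsk_put() + restart on a mismatch */
	}
	rcu_read_unlock();

atomic_inc_not_zero() is the pivot: a refcount that has reached zero means the socket is on its way out, and taking a reference then would resurrect freed memory.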
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b1fbe18feb5a..6bcfe52a9c87 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -67,9 +67,6 @@
  *  ip_id_count: idlock
  */
 
-/* Exported for inet_getid inline function.  */
-DEFINE_SPINLOCK(inet_peer_idlock);
-
 static struct kmem_cache *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
@@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
 	n->v4daddr = daddr;
 	atomic_set(&n->refcnt, 1);
 	atomic_set(&n->rid, 0);
-	n->ip_id_count = secure_ip_id(daddr);
+	atomic_set(&n->ip_id_count, secure_ip_id(daddr));
 	n->tcp_ts_stamp = 0;
 
 	write_lock_bh(&peer_pool_lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 301a389fa7fa..86964b353c31 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -206,10 +206,11 @@ static void ip_expire(unsigned long arg)
 		struct sk_buff *head = qp->q.fragments;
 
		/* Send an ICMP "Fragment Reassembly Timeout" message. */
-		if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) {
+		rcu_read_lock();
+		head->dev = dev_get_by_index_rcu(net, qp->iif);
+		if (head->dev)
 			icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-			dev_put(head->dev);
-		}
+		rcu_read_unlock();
 	}
 out:
 	spin_unlock(&qp->q.lock);
@@ -651,7 +652,7 @@ static int ip4_frags_ns_ctl_register(struct net *net)
 	struct ctl_table_header *hdr;
 
 	table = ip4_frags_ns_ctl_table;
-	if (net != &init_net) {
+	if (!net_eq(net, &init_net)) {
 		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
 		if (table == NULL)
 			goto err_alloc;
@@ -669,7 +670,7 @@ static int ip4_frags_ns_ctl_register(struct net *net)
 	return 0;
 
 err_reg:
-	if (net != &init_net)
+	if (!net_eq(net, &init_net))
 		kfree(table);
 err_alloc:
 	return -ENOMEM;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 143333852624..f36ce156cac6 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -125,7 +125,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 #define HASH_SIZE  16
 
-static int ipgre_net_id;
+static int ipgre_net_id __read_mostly;
 struct ipgre_net {
 	struct ip_tunnel *tunnels[4][HASH_SIZE];
 
@@ -156,8 +156,13 @@ struct ipgre_net {
 #define tunnels_r	tunnels[2]
 #define tunnels_l	tunnels[1]
 #define tunnels_wc	tunnels[0]
+/*
+ * Locking : hash tables are protected by RCU and a spinlock
+ */
+static DEFINE_SPINLOCK(ipgre_lock);
 
-static DEFINE_RWLOCK(ipgre_lock);
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
 /* Given src, dst and key, find appropriate for input tunnel. */
 
@@ -175,7 +180,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 			  ARPHRD_ETHER : ARPHRD_IPGRE;
 	int score, cand_score = 4;
 
-	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
+	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 		if (local != t->parms.iph.saddr ||
 		    remote != t->parms.iph.daddr ||
 		    key != t->parms.i_key ||
@@ -200,7 +205,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 		}
 	}
 
-	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
+	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 		if (remote != t->parms.iph.daddr ||
 		    key != t->parms.i_key ||
 		    !(t->dev->flags & IFF_UP))
@@ -224,7 +229,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 		}
 	}
 
-	for (t = ign->tunnels_l[h1]; t; t = t->next) {
+	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
 		if ((local != t->parms.iph.saddr &&
 		     (local != t->parms.iph.daddr ||
 		      !ipv4_is_multicast(local))) ||
@@ -250,7 +255,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 		}
 	}
 
-	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
+	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
 		if (t->parms.i_key != key ||
 		    !(t->dev->flags & IFF_UP))
 			continue;
@@ -276,8 +281,9 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 	if (cand != NULL)
 		return cand;
 
-	if (ign->fb_tunnel_dev->flags & IFF_UP)
-		return netdev_priv(ign->fb_tunnel_dev);
+	dev = ign->fb_tunnel_dev;
+	if (dev->flags & IFF_UP)
+		return netdev_priv(dev);
 
 	return NULL;
 }
@@ -311,10 +317,10 @@ static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 {
 	struct ip_tunnel **tp = ipgre_bucket(ign, t);
 
+	spin_lock_bh(&ipgre_lock);
 	t->next = *tp;
-	write_lock_bh(&ipgre_lock);
-	*tp = t;
-	write_unlock_bh(&ipgre_lock);
+	rcu_assign_pointer(*tp, t);
+	spin_unlock_bh(&ipgre_lock);
 }
 
 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
@@ -323,9 +329,9 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 
 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
 		if (t == *tp) {
-			write_lock_bh(&ipgre_lock);
+			spin_lock_bh(&ipgre_lock);
 			*tp = t->next;
-			write_unlock_bh(&ipgre_lock);
+			spin_unlock_bh(&ipgre_lock);
 			break;
 		}
 	}
@@ -476,7 +482,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		break;
 	}
 
-	read_lock(&ipgre_lock);
+	rcu_read_lock();
 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 				flags & GRE_KEY ?
 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
@@ -494,7 +500,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 		t->err_count = 1;
 	t->err_time = jiffies;
 out:
-	read_unlock(&ipgre_lock);
+	rcu_read_unlock();
 	return;
 }
 
@@ -573,7 +579,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 	gre_proto = *(__be16 *)(h + 2);
 
-	read_lock(&ipgre_lock);
+	rcu_read_lock();
 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 					  iph->saddr, iph->daddr, key,
 					  gre_proto))) {
@@ -647,13 +653,13 @@ static int ipgre_rcv(struct sk_buff *skb)
 		ipgre_ecn_decapsulate(iph, skb);
 
 		netif_rx(skb);
-		read_unlock(&ipgre_lock);
+		rcu_read_unlock();
 		return(0);
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
 drop:
-	read_unlock(&ipgre_lock);
+	rcu_read_unlock();
 drop_nolock:
 	kfree_skb(skb);
 	return(0);
@@ -662,7 +668,8 @@ drop_nolock:
 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &tunnel->dev->stats;
+	struct net_device_stats *stats = &dev->stats;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
 	struct iphdr  *old_iph = ip_hdr(skb);
 	struct iphdr  *tiph;
 	u8     tos;
@@ -810,7 +817,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
 			ip_rt_put(rt);
-			stats->tx_dropped++;
+			txq->tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -1283,33 +1290,27 @@ static const struct net_protocol ipgre_protocol = {
 	.netns_ok	=	1,
 };
 
-static void ipgre_destroy_tunnels(struct ipgre_net *ign)
+static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 0; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = ign->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = ign->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
 		}
 	}
 }
 
 static int ipgre_init_net(struct net *net)
 {
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 	int err;
-	struct ipgre_net *ign;
-
-	err = -ENOMEM;
-	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
-	if (ign == NULL)
-		goto err_alloc;
-
-	err = net_assign_generic(net, ipgre_net_id, ign);
-	if (err < 0)
-		goto err_assign;
 
 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
 					   ipgre_tunnel_setup);
@@ -1330,27 +1331,26 @@ static int ipgre_init_net(struct net *net)
 err_reg_dev:
 	free_netdev(ign->fb_tunnel_dev);
 err_alloc_dev:
-	/* nothing */
-err_assign:
-	kfree(ign);
-err_alloc:
 	return err;
 }
 
 static void ipgre_exit_net(struct net *net)
 {
 	struct ipgre_net *ign;
+	LIST_HEAD(list);
 
 	ign = net_generic(net, ipgre_net_id);
 	rtnl_lock();
-	ipgre_destroy_tunnels(ign);
+	ipgre_destroy_tunnels(ign, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(ign);
 }
 
 static struct pernet_operations ipgre_net_ops = {
 	.init = ipgre_init_net,
 	.exit = ipgre_exit_net,
+	.id   = &ipgre_net_id,
+	.size = sizeof(struct ipgre_net),
 };
 
 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1471,7 +1471,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 }
 
-static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
+static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
 			 struct nlattr *data[])
 {
 	struct ip_tunnel *nt;
@@ -1670,7 +1670,7 @@ static int __init ipgre_init(void)
 		return -EAGAIN;
 	}
 
-	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
+	err = register_pernet_device(&ipgre_net_ops);
 	if (err < 0)
 		goto gen_device_failed;
 
@@ -1688,7 +1688,7 @@ out:
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+	unregister_pernet_device(&ipgre_net_ops);
 gen_device_failed:
 	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
 	goto out;
@@ -1698,7 +1698,7 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+	unregister_pernet_device(&ipgre_net_ops);
 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
 }
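The ip_gre.c changes above follow the usual RCU conversion recipe: readers walk the tunnel hash chains locklessly via rcu_dereference(), while writers serialize against each other with a plain spinlock and publish new list heads with rcu_assign_pointer(). Below is a minimal sketch of that pattern; struct item, hash, lookup and insert are illustrative names, not part of the patch.

/*
 * Sketch only: reader/writer split as used by the tunnel drivers above.
 */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct item {
	struct item *next;
	int key;
};

static struct item *hash[16];
static DEFINE_SPINLOCK(hash_lock);	/* serializes writers only */

/* Reader: caller must hold rcu_read_lock(), as ipgre_rcv() does above. */
static struct item *lookup(int key)
{
	struct item *i;

	for (i = rcu_dereference(hash[key & 15]); i;
	     i = rcu_dereference(i->next))
		if (i->key == key)
			return i;
	return NULL;
}

/* Writer: set ->next first, then publish the new head last. */
static void insert(struct item *n)
{
	spin_lock_bh(&hash_lock);
	n->next = hash[n->key & 15];
	rcu_assign_pointer(hash[n->key & 15], n);
	spin_unlock_bh(&hash_lock);
}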
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6c98b43badf4..c29de9879fda 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -161,10 +161,10 @@ int ip_call_ra_chain(struct sk_buff *skb)
 		/* If socket is bound to an interface, only report
 		 * the packet if it came from that interface.
 		 */
-		if (sk && inet_sk(sk)->num == protocol &&
+		if (sk && inet_sk(sk)->inet_num == protocol &&
 		    (!sk->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == dev->ifindex) &&
-		    sock_net(sk) == dev_net(dev)) {
+		    net_eq(sock_net(sk), dev_net(dev))) {
 			if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
 					read_unlock(&ip_ra_lock);
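The ip_call_ra_chain() hunk swaps a raw pointer comparison of network namespaces for net_eq(). The benefit is that net_eq() can collapse to a constant when namespaces are compiled out; a simplified sketch of why (the real definition lives in include/net/net_namespace.h and the exact layout may differ by kernel version):

#ifdef CONFIG_NET_NS
static inline int net_eq(const struct net *net1, const struct net *net2)
{
	return net1 == net2;	/* real comparison when namespaces exist */
}
#else
static inline int net_eq(const struct net *net1, const struct net *net2)
{
	return 1;		/* single init_net: the check compiles away */
}
#endif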
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9895180f481..e34013a78ef4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -264,9 +264,11 @@ int ip_mc_output(struct sk_buff *skb)
 
 	   This check is duplicated in ip_mr_input at the moment.
 	 */
-	    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+	    &&
+	    ((rt->rt_flags & RTCF_LOCAL) ||
+	     !(IPCB(skb)->flags & IPSKB_FORWARDED))
 #endif
 	   ) {
 		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 		if (newskb)
 			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
@@ -329,7 +331,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 		__be32 daddr;
 
 		/* Use correct destination address if we have options. */
-		daddr = inet->daddr;
+		daddr = inet->inet_daddr;
 		if(opt && opt->srr)
 			daddr = opt->faddr;
 
@@ -338,13 +340,13 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 				    .mark = sk->sk_mark,
 				    .nl_u = { .ip4_u =
 					      { .daddr = daddr,
-						.saddr = inet->saddr,
+						.saddr = inet->inet_saddr,
 						.tos = RT_CONN_FLAGS(sk) } },
 				    .proto = sk->sk_protocol,
 				    .flags = inet_sk_flowi_flags(sk),
 				    .uli_u = { .ports =
-					       { .sport = inet->sport,
-						 .dport = inet->dport } } };
+					       { .sport = inet->inet_sport,
+						 .dport = inet->inet_dport } } };
 
 		/* If this fails, retransmit mechanism of transport layer will
 		 * keep trying until route appears or the connection times
@@ -379,7 +381,7 @@ packet_routed:
 
 	if (opt && opt->optlen) {
 		iph->ihl += opt->optlen >> 2;
-		ip_options_build(skb, opt, inet->daddr, rt, 0);
+		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
 	}
 
 	ip_select_ident_more(iph, &rt->u.dst, sk,
@@ -501,8 +503,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		if (skb->sk) {
 			frag->sk = skb->sk;
 			frag->destructor = sock_wfree;
-			truesizes += frag->truesize;
 		}
+		truesizes += frag->truesize;
 	}
 
 	/* Everything is OK. Generate! */
@@ -846,7 +848,8 @@ int ip_append_data(struct sock *sk,
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
+			       mtu-exthdrlen);
 		return -EMSGSIZE;
 	}
 
@@ -1100,7 +1103,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
 		return -EMSGSIZE;
 	}
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e982b5c1ee17..cafad9baff03 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -245,7 +245,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 {
 	struct ip_ra_chain *ra, *new_ra, **rap;
 
-	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -480,7 +480,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_OPTIONS:
 	{
 		struct ip_options *opt = NULL;
-		if (optlen > 40 || optlen < 0)
+		if (optlen > 40)
 			goto e_inval;
 		err = ip_options_get_from_user(sock_net(sk), &opt,
 					       optval, optlen);
@@ -492,7 +492,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			if (sk->sk_family == PF_INET ||
 			    (!((1 << sk->sk_state) &
 			       (TCPF_LISTEN | TCPF_CLOSE)) &&
-			     inet->daddr != LOOPBACK4_IPV6)) {
+			     inet->inet_daddr != LOOPBACK4_IPV6)) {
 #endif
 				if (inet->opt)
 					icsk->icsk_ext_hdr_len -= inet->opt->optlen;
@@ -575,7 +575,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < 0 || val > 3)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
@@ -1180,8 +1180,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
 		struct in_pktinfo info;
 
-		info.ipi_addr.s_addr = inet->rcv_saddr;
-		info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+		info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+		info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
 		info.ipi_ifindex = inet->mc_index;
 		put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
 	}
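The IP_MTU_DISCOVER hunk replaces the magic bounds 0 and 3 with the named constants IP_PMTUDISC_DONT and IP_PMTUDISC_PROBE, so the range check stays self-documenting. A hedged userspace example of exercising the option, assuming libc headers that define IP_PMTUDISC_PROBE:

#include <netinet/in.h>
#include <sys/socket.h>
#include <stdio.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int val = IP_PMTUDISC_PROBE;	/* highest value the kernel check accepts */

	if (fd < 0 || setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
				 &val, sizeof(val)) < 0) {
		perror("IP_MTU_DISCOVER");
		return 1;
	}
	return 0;
}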
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c256454..4e08b7f2331c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1172,10 +1172,9 @@ static int __init ic_dynamic(void)
 		schedule_timeout_uninterruptible(1);
 #ifdef IPCONFIG_DHCP
 		/* DHCP isn't done until we get a DHCPACK. */
-		if ((ic_got_reply & IC_BOOTP)
-		    && (ic_proto_enabled & IC_USE_DHCP)
-		    && ic_dhcp_msgtype != DHCPACK)
-		{
+		if ((ic_got_reply & IC_BOOTP) &&
+		    (ic_proto_enabled & IC_USE_DHCP) &&
+		    ic_dhcp_msgtype != DHCPACK) {
 			ic_got_reply = 0;
 			printk(",");
 			continue;
@@ -1344,9 +1343,9 @@ static int __init ip_auto_config(void)
 	 */
 	if (ic_myaddr == NONE ||
 #ifdef CONFIG_ROOT_NFS
-	    (root_server_addr == NONE
-	     && ic_servaddr == NONE
-	     && ROOT_DEV == Root_NFS) ||
+	    (root_server_addr == NONE &&
+	     ic_servaddr == NONE &&
+	     ROOT_DEV == Root_NFS) ||
 #endif
 	    ic_first_dev->next) {
 #ifdef IPCONFIG_DYNAMIC
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae40ed1ba560..eda04fed3379 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -119,7 +119,7 @@
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
-static int ipip_net_id;
+static int ipip_net_id __read_mostly;
 struct ipip_net {
 	struct ip_tunnel *tunnels_r_l[HASH_SIZE];
 	struct ip_tunnel *tunnels_r[HASH_SIZE];
@@ -134,7 +134,13 @@ static void ipip_fb_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
 
-static DEFINE_RWLOCK(ipip_lock);
+/*
+ * Locking : hash tables are protected by RCU and a spinlock
+ */
+static DEFINE_SPINLOCK(ipip_lock);
+
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 		__be32 remote, __be32 local)
@@ -144,20 +150,21 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 	struct ip_tunnel *t;
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
-	for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
+	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
 			return t;
-	}
-	for (t = ipn->tunnels_r[h0]; t; t = t->next) {
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
 		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
 			return t;
-	}
-	for (t = ipn->tunnels_l[h1]; t; t = t->next) {
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
 		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
 			return t;
-	}
-	if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+
+	t = rcu_dereference(ipn->tunnels_wc[0]);
+	if (t && (t->dev->flags&IFF_UP))
 		return t;
 	return NULL;
 }
@@ -193,9 +200,9 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
 
 	for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
 		if (t == *tp) {
-			write_lock_bh(&ipip_lock);
+			spin_lock_bh(&ipip_lock);
 			*tp = t->next;
-			write_unlock_bh(&ipip_lock);
+			spin_unlock_bh(&ipip_lock);
 			break;
 		}
 	}
@@ -205,10 +212,10 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
 {
 	struct ip_tunnel **tp = ipip_bucket(ipn, t);
 
+	spin_lock_bh(&ipip_lock);
 	t->next = *tp;
-	write_lock_bh(&ipip_lock);
-	*tp = t;
-	write_unlock_bh(&ipip_lock);
+	rcu_assign_pointer(*tp, t);
+	spin_unlock_bh(&ipip_lock);
 }
 
 static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -267,9 +274,9 @@ static void ipip_tunnel_uninit(struct net_device *dev)
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
 	if (dev == ipn->fb_tunnel_dev) {
-		write_lock_bh(&ipip_lock);
+		spin_lock_bh(&ipip_lock);
 		ipn->tunnels_wc[0] = NULL;
-		write_unlock_bh(&ipip_lock);
+		spin_unlock_bh(&ipip_lock);
 	} else
 		ipip_tunnel_unlink(ipn, netdev_priv(dev));
 	dev_put(dev);
@@ -318,7 +325,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 
 	err = -ENOENT;
 
-	read_lock(&ipip_lock);
+	rcu_read_lock();
 	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
 	if (t == NULL || t->parms.iph.daddr == 0)
 		goto out;
@@ -333,7 +340,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		t->err_count = 1;
 	t->err_time = jiffies;
 out:
-	read_unlock(&ipip_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -351,11 +358,11 @@ static int ipip_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
 
-	read_lock(&ipip_lock);
+	rcu_read_lock();
 	if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
 					iph->saddr, iph->daddr)) != NULL) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-			read_unlock(&ipip_lock);
+			rcu_read_unlock();
 			kfree_skb(skb);
 			return 0;
 		}
@@ -374,10 +381,10 @@ static int ipip_rcv(struct sk_buff *skb)
 		nf_reset(skb);
 		ipip_ecn_decapsulate(iph, skb);
 		netif_rx(skb);
-		read_unlock(&ipip_lock);
+		rcu_read_unlock();
 		return 0;
 	}
-	read_unlock(&ipip_lock);
+	rcu_read_unlock();
 
 	return -1;
 }
@@ -390,7 +397,8 @@ static int ipip_rcv(struct sk_buff *skb)
 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &tunnel->dev->stats;
+	struct net_device_stats *stats = &dev->stats;
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos = tunnel->parms.iph.tos;
 	__be16 df = tiph->frag_off;
@@ -480,7 +488,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
 			ip_rt_put(rt);
-			stats->tx_dropped++;
+			txq->tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -748,33 +756,27 @@ static struct xfrm_tunnel ipip_handler = {
 static const char banner[] __initconst =
 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
 
-static void ipip_destroy_tunnels(struct ipip_net *ipn)
+static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 {
 	int prio;
 
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t;
-			while ((t = ipn->tunnels[prio][h]) != NULL)
-				unregister_netdevice(t->dev);
+			struct ip_tunnel *t = ipn->tunnels[prio][h];
+
+			while (t != NULL) {
+				unregister_netdevice_queue(t->dev, head);
+				t = t->next;
+			}
 		}
 	}
 }
 
 static int ipip_init_net(struct net *net)
 {
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 	int err;
-	struct ipip_net *ipn;
-
-	err = -ENOMEM;
-	ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
-	if (ipn == NULL)
-		goto err_alloc;
-
-	err = net_assign_generic(net, ipip_net_id, ipn);
-	if (err < 0)
-		goto err_assign;
 
 	ipn->tunnels[0] = ipn->tunnels_wc;
 	ipn->tunnels[1] = ipn->tunnels_l;
@@ -801,27 +803,26 @@ err_reg_dev:
 	free_netdev(ipn->fb_tunnel_dev);
 err_alloc_dev:
 	/* nothing */
-err_assign:
-	kfree(ipn);
-err_alloc:
 	return err;
 }
 
 static void ipip_exit_net(struct net *net)
 {
-	struct ipip_net *ipn;
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+	LIST_HEAD(list);
 
-	ipn = net_generic(net, ipip_net_id);
 	rtnl_lock();
-	ipip_destroy_tunnels(ipn);
-	unregister_netdevice(ipn->fb_tunnel_dev);
+	ipip_destroy_tunnels(ipn, &list);
+	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
-	kfree(ipn);
 }
 
 static struct pernet_operations ipip_net_ops = {
 	.init = ipip_init_net,
 	.exit = ipip_exit_net,
+	.id   = &ipip_net_id,
+	.size = sizeof(struct ipip_net),
 };
 
 static int __init ipip_init(void)
@@ -835,7 +836,7 @@ static int __init ipip_init(void)
 		return -EAGAIN;
 	}
 
-	err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
+	err = register_pernet_device(&ipip_net_ops);
 	if (err)
 		xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
 
@@ -847,7 +848,7 @@ static void __exit ipip_fini(void)
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
 
-	unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
+	unregister_pernet_device(&ipip_net_ops);
 }
 
 module_init(ipip_init);
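Both tunnel drivers above stop allocating their per-namespace state by hand. With .id and .size filled in, the pernet core allocates the zeroed struct and binds it to the generic id before .init runs, and releases it after .exit, which is why the kzalloc/net_assign_generic/kfree boilerplate disappears. A minimal sketch of the pattern; struct my_net, my_net_id and the function names are illustrative, not from the patch:

#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct my_net {
	int something;
};

static int my_net_id __read_mostly;

static int my_init_net(struct net *net)
{
	struct my_net *mn = net_generic(net, my_net_id);

	mn->something = 1;	/* storage already zeroed by the pernet core */
	return 0;
}

static void my_exit_net(struct net *net)
{
	/* nothing to free: the core releases the per-net storage itself */
}

static struct pernet_operations my_net_ops = {
	.init = my_init_net,
	.exit = my_exit_net,
	.id   = &my_net_id,
	.size = sizeof(struct my_net),
};

/* module init/exit would call register_pernet_device(&my_net_ops) and
 * unregister_pernet_device(&my_net_ops), matching the converted code. */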
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 99508d66a642..54596f73eff5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -275,7 +275,8 @@ failure:
  * @notify: Set to 1, if the caller is a notifier_call
  */
 
-static int vif_delete(struct net *net, int vifi, int notify)
+static int vif_delete(struct net *net, int vifi, int notify,
+		      struct list_head *head)
 {
 	struct vif_device *v;
 	struct net_device *dev;
@@ -319,7 +320,7 @@ static int vif_delete(struct net *net, int vifi, int notify)
 	}
 
 	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
-		unregister_netdevice(dev);
+		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
 	return 0;
@@ -469,8 +470,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 			return err;
 		}
 		break;
+
+	case VIFF_USE_IFINDEX:
 	case 0:
-		dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+			if (dev && dev->ip_ptr == NULL) {
+				dev_put(dev);
+				return -EADDRNOTAVAIL;
+			}
+		} else
+			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+
 		if (!dev)
 			return -EADDRNOTAVAIL;
 		err = dev_set_allmulti(dev, 1);
@@ -862,14 +873,16 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
 static void mroute_clean_tables(struct net *net)
 {
 	int i;
+	LIST_HEAD(list);
 
 	/*
 	 *	Shut down all active vif entries
 	 */
 	for (i = 0; i < net->ipv4.maxvif; i++) {
 		if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
-			vif_delete(net, i, 0);
+			vif_delete(net, i, 0, &list);
 	}
+	unregister_netdevice_many(&list);
 
 	/*
 	 *	Wipe the cache
@@ -948,7 +961,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	switch (optname) {
 	case MRT_INIT:
 		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->num != IPPROTO_IGMP)
+		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
 			return -EOPNOTSUPP;
 		if (optlen != sizeof(int))
 			return -ENOPROTOOPT;
@@ -985,7 +998,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		if (optname == MRT_ADD_VIF) {
 			ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
 		} else {
-			ret = vif_delete(net, vif.vifc_vifi, 0);
+			ret = vif_delete(net, vif.vifc_vifi, 0, NULL);
 		}
 		rtnl_unlock();
 		return ret;
@@ -1148,6 +1161,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
 	struct net *net = dev_net(dev);
 	struct vif_device *v;
 	int ct;
+	LIST_HEAD(list);
 
 	if (!net_eq(dev_net(dev), net))
 		return NOTIFY_DONE;
@@ -1157,8 +1171,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
 	v = &net->ipv4.vif_table[0];
 	for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
 		if (v->dev == dev)
-			vif_delete(net, ct, 1);
+			vif_delete(net, ct, 1, &list);
 	}
+	unregister_netdevice_many(&list);
 	return NOTIFY_DONE;
 }
 
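ipmr.c now threads a list_head through vif_delete() so devices are queued with unregister_netdevice_queue() and torn down in a single unregister_netdevice_many() call, amortizing the per-unregister synchronization across the whole batch instead of paying it once per device. A hedged sketch of the idiom; drop_devices and the devs array are hypothetical callers, not from the patch:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void drop_devices(struct net_device *devs[], int n)
{
	LIST_HEAD(list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &list);	/* just queue */
	unregister_netdevice_many(&list);	/* one batched teardown */
	rtnl_unlock();
}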
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index db52c0cb0c11..c14623fc4d5e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -155,10 +155,10 @@ static int nf_ip_reroute(struct sk_buff *skb,
 	if (entry->hook == NF_INET_LOCAL_OUT) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (!(iph->tos == rt_info->tos
-		      && skb->mark == rt_info->mark
-		      && iph->daddr == rt_info->daddr
-		      && iph->saddr == rt_info->saddr))
+		if (!(iph->tos == rt_info->tos &&
+		      skb->mark == rt_info->mark &&
+		      iph->daddr == rt_info->daddr &&
+		      iph->saddr == rt_info->saddr))
 			return ip_route_me_harder(skb, RTN_UNSPEC);
 	}
 	return 0;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 27774c99d888..06632762ba5f 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -384,11 +384,11 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 				|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
 
 			/* Unconditional return/END. */
-			if ((e->target_offset == sizeof(struct arpt_entry)
-			     && (strcmp(t->target.u.user.name,
-					ARPT_STANDARD_TARGET) == 0)
-			     && t->verdict < 0
-			     && unconditional(&e->arp)) || visited) {
+			if ((e->target_offset == sizeof(struct arpt_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     ARPT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 && unconditional(&e->arp)) ||
+			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
@@ -427,8 +427,8 @@ static int mark_source_chains(struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   ARPT_STANDARD_TARGET) == 0
-				    && newpos >= 0) {
+					   ARPT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct arpt_entry)) {
 						duprintf("mark_source_chains: "
@@ -559,8 +559,8 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 {
 	unsigned int h;
 
-	if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
-	    || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+	if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
 		duprintf("Bad offset %p\n", e);
 		return -EINVAL;
 	}
@@ -1251,8 +1251,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	int ret, off, h;
 
 	duprintf("check_compat_entry_size_and_hooks %p\n", e);
-	if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0
-	    || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+	if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
 		duprintf("Bad offset %p, limit = %p\n", e, limit);
 		return -EINVAL;
 	}
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c9f90e8c5191..2855f1f38cbc 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -497,10 +497,9 @@ ipq_rcv_nl_event(struct notifier_block *this,
 {
 	struct netlink_notify *n = ptr;
 
-	if (event == NETLINK_URELEASE &&
-	    n->protocol == NETLINK_FIREWALL && n->pid) {
+	if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
 		write_lock_bh(&queue_lock);
-		if ((n->net == &init_net) && (n->pid == peer_pid))
+		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
 			__ipq_reset();
 		write_unlock_bh(&queue_lock);
 	}
@@ -621,7 +620,7 @@ cleanup_netlink_notifier:
 static void __exit ip_queue_fini(void)
 {
 	nf_unregister_queue_handlers(&nfqh);
-	synchronize_net();
+
 	ipq_flush(NULL, 0);
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cde755d5eeab..572330a552ef 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -89,9 +89,9 @@ ip_packet_match(const struct iphdr *ip,
 #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
 
 	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
-		  IPT_INV_SRCIP)
-	    || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
-		     IPT_INV_DSTIP)) {
+		  IPT_INV_SRCIP) ||
+	    FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+		  IPT_INV_DSTIP)) {
 		dprintf("Source or dest mismatch.\n");
 
 		dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
@@ -122,8 +122,8 @@ ip_packet_match(const struct iphdr *ip,
 	}
 
 	/* Check specific protocol */
-	if (ipinfo->proto
-	    && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+	if (ipinfo->proto &&
+	    FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
 		dprintf("Packet protocol %hi does not match %hi.%s\n",
 			ip->protocol, ipinfo->proto,
 			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
@@ -246,11 +246,11 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
 	} else if (s == e) {
 		(*rulenum)++;
 
-		if (s->target_offset == sizeof(struct ipt_entry)
-		    && strcmp(t->target.u.kernel.target->name,
-			      IPT_STANDARD_TARGET) == 0
-		    && t->verdict < 0
-		    && unconditional(&s->ip)) {
+		if (s->target_offset == sizeof(struct ipt_entry) &&
+		    strcmp(t->target.u.kernel.target->name,
+			   IPT_STANDARD_TARGET) == 0 &&
+		    t->verdict < 0 &&
+		    unconditional(&s->ip)) {
 			/* Tail of chains: STANDARD target (return/policy) */
 			*comment = *chainname == hookname
 				? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -388,8 +388,8 @@ ipt_do_table(struct sk_buff *skb,
 				back = get_entry(table_base, back->comefrom);
 				continue;
 			}
-			if (table_base + v != ipt_next_entry(e)
-			    && !(e->ip.flags & IPT_F_GOTO)) {
+			if (table_base + v != ipt_next_entry(e) &&
+			    !(e->ip.flags & IPT_F_GOTO)) {
 				/* Save old back ptr in next entry */
 				struct ipt_entry *next = ipt_next_entry(e);
 				next->comefrom = (void *)back - table_base;
@@ -473,11 +473,11 @@ mark_source_chains(struct xt_table_info *newinfo,
 			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
 
 			/* Unconditional return/END. */
-			if ((e->target_offset == sizeof(struct ipt_entry)
-			     && (strcmp(t->target.u.user.name,
-					IPT_STANDARD_TARGET) == 0)
-			     && t->verdict < 0
-			     && unconditional(&e->ip)) || visited) {
+			if ((e->target_offset == sizeof(struct ipt_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     IPT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 && unconditional(&e->ip)) ||
+			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
@@ -524,8 +524,8 @@ mark_source_chains(struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   IPT_STANDARD_TARGET) == 0
-				    && newpos >= 0) {
+					   IPT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct ipt_entry)) {
 						duprintf("mark_source_chains: "
@@ -735,8 +735,8 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 {
 	unsigned int h;
 
-	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
-	    || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
 		duprintf("Bad offset %p\n", e);
 		return -EINVAL;
 	}
@@ -1548,8 +1548,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	int ret, off, h;
 
 	duprintf("check_compat_entry_size_and_hooks %p\n", e);
-	if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0
-	    || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+	if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
 		duprintf("Bad offset %p, limit = %p\n", e, limit);
 		return -EINVAL;
 	}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2e4f98b85524..40ca2d240abb 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -303,9 +303,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
 
 	/* special case: ICMP error handling. conntrack distinguishes between
 	 * error messages (RELATED) and information requests (see below) */
-	if (ip_hdr(skb)->protocol == IPPROTO_ICMP
-	    && (ctinfo == IP_CT_RELATED
-		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
+	    (ctinfo == IP_CT_RELATED ||
+	     ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
 		return XT_CONTINUE;
 
 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -362,8 +362,8 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
 			return false;
 
 		}
-		if (e->ip.dmsk.s_addr != htonl(0xffffffff)
-		    || e->ip.dst.s_addr == 0) {
+		if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
+		    e->ip.dst.s_addr == 0) {
 			printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
 			return false;
 		}
@@ -495,14 +495,14 @@ arp_mangle(unsigned int hook,
 	struct clusterip_config *c;
 
 	/* we don't care about non-ethernet and non-ipv4 ARP */
-	if (arp->ar_hrd != htons(ARPHRD_ETHER)
-	    || arp->ar_pro != htons(ETH_P_IP)
-	    || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+	if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+	    arp->ar_pro != htons(ETH_P_IP) ||
+	    arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
 		return NF_ACCEPT;
 
 	/* we only want to mangle arp requests and replies */
-	if (arp->ar_op != htons(ARPOP_REPLY)
-	    && arp->ar_op != htons(ARPOP_REQUEST))
+	if (arp->ar_op != htons(ARPOP_REPLY) &&
+	    arp->ar_op != htons(ARPOP_REQUEST))
 		return NF_ACCEPT;
 
 	payload = (void *)(arp+1);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index f7e2fa0974dc..549e206cdd42 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -85,8 +85,8 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
 		if (!set_ect_ip(skb, einfo))
 			return NF_DROP;
 
-	if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
-	    && ip_hdr(skb)->protocol == IPPROTO_TCP)
+	if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+	    ip_hdr(skb)->protocol == IPPROTO_TCP)
 		if (!set_ect_tcp(skb, einfo))
 			return NF_DROP;
 
@@ -108,8 +108,8 @@ static bool ecn_tg_check(const struct xt_tgchk_param *par)
 			einfo->ip_ect);
 		return false;
 	}
-	if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
-	    && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+	if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+	    (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
 		printk(KERN_WARNING "ECN: cannot use TCP operations on a "
 		       "non-tcp rule\n");
 		return false;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index acc44c69eb68..ee128efa1c8d 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -74,8 +74,8 @@ static void dump_packet(const struct nf_loginfo *info,
 	if (ntohs(ih->frag_off) & IP_OFFSET)
 		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
-	if ((logflags & IPT_LOG_IPOPT)
-	    && ih->ihl * 4 > sizeof(struct iphdr)) {
+	if ((logflags & IPT_LOG_IPOPT) &&
+	    ih->ihl * 4 > sizeof(struct iphdr)) {
 		const unsigned char *op;
 		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
 		unsigned int i, optsize;
@@ -146,8 +146,8 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 11 "URGP=65535 " */
 		printk("URGP=%u ", ntohs(th->urg_ptr));
 
-		if ((logflags & IPT_LOG_TCPOPT)
-		    && th->doff * 4 > sizeof(struct tcphdr)) {
+		if ((logflags & IPT_LOG_TCPOPT) &&
+		    th->doff * 4 > sizeof(struct tcphdr)) {
 			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
 			const unsigned char *op;
 			unsigned int i, optsize;
@@ -238,9 +238,9 @@ static void dump_packet(const struct nf_loginfo *info,
 		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		if (ich->type <= NR_ICMP_TYPES
-		    && required_len[ich->type]
-		    && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+		if (ich->type <= NR_ICMP_TYPES &&
+		    required_len[ich->type] &&
+		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
 			printk("INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
@@ -276,8 +276,8 @@ static void dump_packet(const struct nf_loginfo *info,
 		}
 
 		/* Max length: 10 "MTU=65535 " */
-		if (ich->type == ICMP_DEST_UNREACH
-		    && ich->code == ICMP_FRAG_NEEDED)
+		if (ich->type == ICMP_DEST_UNREACH &&
+		    ich->code == ICMP_FRAG_NEEDED)
 			printk("MTU=%u ", ntohs(ich->un.frag.mtu));
 	}
 	break;
@@ -407,8 +407,8 @@ ipt_log_packet(u_int8_t pf,
 	if (in && !out) {
 		/* MAC logging for input chain only. */
 		printk("MAC=");
-		if (skb->dev && skb->dev->hard_header_len
-		    && skb->mac_header != skb->network_header) {
+		if (skb->dev && skb->dev->hard_header_len &&
+		    skb->mac_header != skb->network_header) {
 			int i;
 			const unsigned char *p = skb_mac_header(skb);
 			for (i = 0; i < skb->dev->hard_header_len; i++,p++)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index dada0863946d..650b54042b01 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
 	ct = nf_ct_get(skb, &ctinfo);
 	nat = nfct_nat(ct);
 
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
-			    || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+			    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
 
 	/* Source address is 0.0.0.0 - locally generated packet that is
 	 * probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index c93ae44bff2a..5113b8f1a379 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -184,8 +184,8 @@ static bool reject_tg_check(const struct xt_tgchk_param *par)
 		return false;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
-		if (e->ip.proto != IPPROTO_TCP
-		    || (e->ip.invflags & XT_INV_PROTO)) {
+		if (e->ip.proto != IPPROTO_TCP ||
+		    (e->ip.invflags & XT_INV_PROTO)) {
 			printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
 			return false;
 		}
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d32cc4bb328a..399061c3fd7d 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -226,9 +226,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
 	else
 		*(pm->prefix) = '\0';
 
-	if (in && in->hard_header_len > 0
-	    && skb->mac_header != skb->network_header
-	    && in->hard_header_len <= ULOG_MAC_LEN) {
+	if (in && in->hard_header_len > 0 &&
+	    skb->mac_header != skb->network_header &&
+	    in->hard_header_len <= ULOG_MAC_LEN) {
 		memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
 		pm->mac_len = in->hard_header_len;
 	} else
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 6289b64144c6..2a1e56b71908 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -96,8 +96,8 @@ static bool ecn_mt_check(const struct xt_mtchk_param *par)
 	if (info->invert & IPT_ECN_OP_MATCH_MASK)
 		return false;
 
-	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
-	    && ip->proto != IPPROTO_TCP) {
+	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
+	    ip->proto != IPPROTO_TCP) {
 		printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
 		       " non-tcp packets\n");
 		return false;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 036047f9b0f2..fae78c3076c4 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -130,8 +130,8 @@ ipt_local_hook(unsigned int hook,
 	u_int32_t mark;
 
 	/* root is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr)
-	    || ip_hdrlen(skb) < sizeof(struct iphdr))
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
 	/* Save things which could affect route */
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 99eb76c65d25..3bd3d6388da5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -94,8 +94,8 @@ ipt_local_out_hook(unsigned int hook,
 		   int (*okfn)(struct sk_buff *))
 {
 	/* Somebody is playing with raw sockets. */
-	if (skb->len < sizeof(struct iphdr)
-	    || ip_hdrlen(skb) < sizeof(struct iphdr))
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 	return ipt_do_table(skb, hook, in, out,
 			    dev_net(out)->ipv4.iptable_security);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 092d68f916e6..d171b123a656 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -247,10 +247,10 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	struct nf_conntrack_tuple tuple;
 
 	memset(&tuple, 0, sizeof(tuple));
-	tuple.src.u3.ip = inet->rcv_saddr;
-	tuple.src.u.tcp.port = inet->sport;
-	tuple.dst.u3.ip = inet->daddr;
-	tuple.dst.u.tcp.port = inet->dport;
+	tuple.src.u3.ip = inet->inet_rcv_saddr;
+	tuple.src.u.tcp.port = inet->inet_sport;
+	tuple.dst.u3.ip = inet->inet_daddr;
+	tuple.dst.u.tcp.port = inet->inet_dport;
 	tuple.src.l3num = PF_INET;
 	tuple.dst.protonum = sk->sk_protocol;
 
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 9072058778b8..7afd39b5b781 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -54,8 +54,8 @@ static const u_int8_t invmap[] = {
 static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
 			      const struct nf_conntrack_tuple *orig)
 {
-	if (orig->dst.u.icmp.type >= sizeof(invmap)
-	    || !invmap[orig->dst.u.icmp.type])
+	if (orig->dst.u.icmp.type >= sizeof(invmap) ||
+	    !invmap[orig->dst.u.icmp.type])
 		return false;
 
 	tuple->src.u.icmp.id = orig->src.u.icmp.id;
@@ -101,8 +101,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		[ICMP_ADDRESS] = 1
 	};
 
-	if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
-	    || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
+	if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
+	    !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
 		pr_debug("icmp: can't create new conn with type %u\n",
 			 ct->tuplehash[0].tuple.dst.u.icmp.type);
@@ -201,11 +201,11 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
 	}
 
 	/* Need to track icmp error message? */
-	if (icmph->type != ICMP_DEST_UNREACH
-	    && icmph->type != ICMP_SOURCE_QUENCH
-	    && icmph->type != ICMP_TIME_EXCEEDED
-	    && icmph->type != ICMP_PARAMETERPROB
-	    && icmph->type != ICMP_REDIRECT)
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_SOURCE_QUENCH &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB &&
+	    icmph->type != ICMP_REDIRECT)
 		return NF_ACCEPT;
 
 	return icmp_error_message(net, skb, ctinfo, hooknum);
@@ -238,17 +238,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
 static int icmp_nlattr_to_tuple(struct nlattr *tb[],
 				struct nf_conntrack_tuple *tuple)
 {
-	if (!tb[CTA_PROTO_ICMP_TYPE]
-	    || !tb[CTA_PROTO_ICMP_CODE]
-	    || !tb[CTA_PROTO_ICMP_ID])
+	if (!tb[CTA_PROTO_ICMP_TYPE] ||
+	    !tb[CTA_PROTO_ICMP_CODE] ||
+	    !tb[CTA_PROTO_ICMP_ID])
 		return -EINVAL;
 
 	tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
 	tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
 	tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
 
-	if (tuple->dst.u.icmp.type >= sizeof(invmap)
-	    || !invmap[tuple->dst.u.icmp.type])
+	if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+	    !invmap[tuple->dst.u.icmp.type])
 		return -EINVAL;
 
 	return 0;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index f9520fa3aba9..7f10a6be0191 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -41,18 +41,14 @@ adjust_tcp_sequence(u32 seq,
                     struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo)
 {
-    int dir;
-    struct nf_nat_seq *this_way, *other_way;
+    enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
     struct nf_conn_nat *nat = nfct_nat(ct);
+    struct nf_nat_seq *this_way = &nat->seq[dir];
 
-    pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq);
-
-    dir = CTINFO2DIR(ctinfo);
-
-    this_way = &nat->seq[dir];
-    other_way = &nat->seq[!dir];
+    pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+             seq, sizediff);
 
-    pr_debug("nf_nat_resize_packet: Seq_offset before: ");
+    pr_debug("adjust_tcp_sequence: Seq_offset before: ");
     DUMP_OFFSET(this_way);
 
     spin_lock_bh(&nf_nat_seqofs_lock);
@@ -63,13 +59,13 @@ adjust_tcp_sequence(u32 seq,
      * retransmit */
     if (this_way->offset_before == this_way->offset_after ||
         before(this_way->correction_pos, seq)) {
         this_way->correction_pos = seq;
         this_way->offset_before = this_way->offset_after;
         this_way->offset_after += sizediff;
     }
     spin_unlock_bh(&nf_nat_seqofs_lock);
 
-    pr_debug("nf_nat_resize_packet: Seq_offset after: ");
+    pr_debug("adjust_tcp_sequence: Seq_offset after: ");
     DUMP_OFFSET(this_way);
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5f41d017ddd8..5678e9562c15 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -197,11 +197,11 @@ nf_nat_out(unsigned int hooknum,
         (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
         enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 
-        if (ct->tuplehash[dir].tuple.src.u3.ip !=
-            ct->tuplehash[!dir].tuple.dst.u3.ip
-            || ct->tuplehash[dir].tuple.src.u.all !=
-            ct->tuplehash[!dir].tuple.dst.u.all
+        if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+             ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+            (ct->tuplehash[dir].tuple.src.u.all !=
+             ct->tuplehash[!dir].tuple.dst.u.all)
             )
             return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
     }
 #endif
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ab996f9c0fe0..ce154b47f1da 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -87,7 +87,7 @@ void raw_hash_sk(struct sock *sk)
     struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
     struct hlist_head *head;
 
-    head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
+    head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
 
     write_lock_bh(&h->lock);
     sk_add_node(sk, head);
@@ -115,9 +115,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
     sk_for_each_from(sk, node) {
         struct inet_sock *inet = inet_sk(sk);
 
-        if (net_eq(sock_net(sk), net) && inet->num == num &&
-            !(inet->daddr && inet->daddr != raddr) &&
-            !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
+        if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+            !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+            !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
             !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
             goto found; /* gotcha */
     }
@@ -292,7 +292,6 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
     /* Charge it to the socket. */
 
     if (sock_queue_rcv_skb(sk, skb) < 0) {
-        atomic_inc(&sk->sk_drops);
         kfree_skb(skb);
         return NET_RX_DROP;
     }
@@ -327,7 +326,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
     int err;
 
     if (length > rt->u.dst.dev->mtu) {
-        ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+        ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
                        rt->u.dst.dev->mtu);
         return -EMSGSIZE;
     }
@@ -500,10 +499,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
         err = -EDESTADDRREQ;
         if (sk->sk_state != TCP_ESTABLISHED)
             goto out;
-        daddr = inet->daddr;
+        daddr = inet->inet_daddr;
     }
 
-    ipc.addr = inet->saddr;
+    ipc.addr = inet->inet_saddr;
     ipc.opt = NULL;
     ipc.shtx.flags = 0;
     ipc.oif = sk->sk_bound_dev_if;
@@ -645,9 +644,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
     if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
         chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
         goto out;
-    inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+    inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
     if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-        inet->saddr = 0; /* Use device */
+        inet->inet_saddr = 0; /* Use device */
     sk_dst_reset(sk);
     ret = 0;
 out:    return ret;
@@ -692,7 +691,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
     if (err)
         goto done;
 
-    sock_recv_timestamp(msg, sk, skb);
+    sock_recv_ts_and_drops(msg, sk, skb);
 
     /* Copy the address. */
     if (sin) {
@@ -717,7 +716,7 @@ static int raw_init(struct sock *sk)
 {
     struct raw_sock *rp = raw_sk(sk);
 
-    if (inet_sk(sk)->num == IPPROTO_ICMP)
+    if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
         memset(&rp->filter, 0, sizeof(rp->filter));
     return 0;
 }
@@ -754,7 +753,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
                              char __user *optval, unsigned int optlen)
 {
     if (optname == ICMP_FILTER) {
-        if (inet_sk(sk)->num != IPPROTO_ICMP)
+        if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
             return -EOPNOTSUPP;
         else
             return raw_seticmpfilter(sk, optval, optlen);
@@ -784,7 +783,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname,
                              char __user *optval, int __user *optlen)
 {
     if (optname == ICMP_FILTER) {
-        if (inet_sk(sk)->num != IPPROTO_ICMP)
+        if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
             return -EOPNOTSUPP;
         else
             return raw_geticmpfilter(sk, optval, optlen);
@@ -943,10 +942,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop);
 static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 {
     struct inet_sock *inet = inet_sk(sp);
-    __be32 dest = inet->daddr,
-           src = inet->rcv_saddr;
+    __be32 dest = inet->inet_daddr,
+           src = inet->inet_rcv_saddr;
     __u16 destp = 0,
-          srcp = inet->num;
+          srcp = inet->inet_num;
 
     seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
         " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0d9f584a3811..e446496f564f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = {
 };
 
 #ifdef CONFIG_NET_CLS_ROUTE
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
-                           int length, int *eof, void *data)
-{
-    unsigned int i;
-
-    if ((offset & 3) || (length & 3))
-        return -EIO;
-
-    if (offset >= sizeof(struct ip_rt_acct) * 256) {
-        *eof = 1;
-        return 0;
-    }
-
-    if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
-        length = sizeof(struct ip_rt_acct) * 256 - offset;
-        *eof = 1;
-    }
-
-    offset /= sizeof(u32);
-
-    if (length > 0) {
-        u32 *dst = (u32 *) buffer;
-
-        *start = buffer;
-        memset(dst, 0, length);
-
-        for_each_possible_cpu(i) {
-            unsigned int j;
-            u32 *src;
-
-            src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
-            for (j = 0; j < length/4; j++)
-                dst[j] += src[j];
-        }
-    }
-    return length;
-}
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+    struct ip_rt_acct *dst, *src;
+    unsigned int i, j;
+
+    dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+    if (!dst)
+        return -ENOMEM;
+
+    for_each_possible_cpu(i) {
+        src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+        for (j = 0; j < 256; j++) {
+            dst[j].o_bytes += src[j].o_bytes;
+            dst[j].o_packets += src[j].o_packets;
+            dst[j].i_bytes += src[j].i_bytes;
+            dst[j].i_packets += src[j].i_packets;
+        }
+    }
+
+    seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+    kfree(dst);
+    return 0;
+}
+
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+    return single_open(file, rt_acct_proc_show, NULL);
+}
+
+static const struct file_operations rt_acct_proc_fops = {
+    .owner   = THIS_MODULE,
+    .open    = rt_acct_proc_open,
+    .read    = seq_read,
+    .llseek  = seq_lseek,
+    .release = single_release,
+};
 #endif
 
 static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
         goto err2;
 
 #ifdef CONFIG_NET_CLS_ROUTE
-    pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
-            ip_rt_acct_read, NULL);
+    pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
     if (!pde)
         goto err3;
 #endif
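The old ip_rt_acct_read() used the legacy procfs read callback, which makes each handler responsible for offset and EOF bookkeeping; the single_open()/seq_file pattern adopted here moves all of that into common code, so a show() callback only emits the complete contents once. A minimal self-contained sketch of the same pattern for a hypothetical counter file (all names below are illustrative, not from this patch):

#include <linux/module.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static unsigned long example_counter;    /* hypothetical datum to export */

static int example_proc_show(struct seq_file *m, void *v)
{
    /* Emit everything in one pass; seq_file handles offsets and EOF. */
    seq_printf(m, "%lu\n", example_counter);
    return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
    return single_open(file, example_proc_show, NULL);
}

static const struct file_operations example_proc_fops = {
    .owner   = THIS_MODULE,
    .open    = example_proc_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = single_release,
};

static int __init example_init(void)
{
    /* Registers /proc/example_counter, mirroring the rt_acct conversion. */
    if (!proc_create("example_counter", 0, NULL, &example_proc_fops))
        return -ENOMEM;
    return 0;
}

static void __exit example_exit(void)
{
    remove_proc_entry("example_counter", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");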
@@ -703,7 +701,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 
 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 {
-    return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
+    return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
 }
 
 static inline int rt_is_expired(struct rtable *rth)
@@ -902,6 +900,12 @@ void rt_cache_flush(struct net *net, int delay)
         rt_do_flush(!in_softirq());
 }
 
+/* Flush previous cache invalidated entries from the cache */
+void rt_cache_flush_batch(void)
+{
+    rt_do_flush(!in_softirq());
+}
+
 /*
  * We change rt_genid and let gc do the cleanup
  */
@@ -1346,9 +1350,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
         return;
 
     net = dev_net(dev);
-    if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
-        || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
-        || ipv4_is_zeronet(new_gw))
+    if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+        ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+        ipv4_is_zeronet(new_gw))
         goto reject_redirect;
 
     if (!rt_caching(net))
@@ -1628,9 +1632,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
     __be32 daddr = iph->daddr;
     unsigned short est_mtu = 0;
 
-    if (ipv4_config.no_pmtu_disc)
-        return 0;
-
     for (k = 0; k < 2; k++) {
         for (i = 0; i < 2; i++) {
             unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
@@ -2314,10 +2315,11 @@ skip_cache:
                     ip_hdr(skb)->protocol);
         if (our
 #ifdef CONFIG_IP_MROUTE
-            || (!ipv4_is_local_multicast(daddr) &&
-                IN_DEV_MFORWARD(in_dev))
+            ||
+            (!ipv4_is_local_multicast(daddr) &&
+             IN_DEV_MFORWARD(in_dev))
 #endif
            ) {
             rcu_read_unlock();
             return ip_route_input_mc(skb, daddr, saddr,
                                      tos, dev, our);
@@ -2514,9 +2516,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
        of another iface. --ANK
      */
 
-    if (oldflp->oif == 0
-        && (ipv4_is_multicast(oldflp->fl4_dst) ||
-            oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+    if (oldflp->oif == 0 &&
+        (ipv4_is_multicast(oldflp->fl4_dst) ||
+         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
         dev_out = ip_dev_find(net, oldflp->fl4_src);
         if (dev_out == NULL)
@@ -2855,7 +2857,7 @@ static int rt_fill_info(struct net *net,
     error = rt->u.dst.error;
     expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
     if (rt->peer) {
-        id = rt->peer->ip_id_count;
+        id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
         if (rt->peer->tcp_ts_stamp) {
             ts = rt->peer->tcp_ts;
             tsage = get_seconds() - rt->peer->tcp_ts_stamp;
@@ -3257,7 +3259,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
     struct ctl_table *tbl;
 
     tbl = ipv4_route_flush_table;
-    if (net != &init_net) {
+    if (!net_eq(net, &init_net)) {
         tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
         if (tbl == NULL)
             goto err_dup;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e077ac33..26399ad2a289 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
                              struct ip_options *opt)
 {
+    struct tcp_options_received tcp_opt;
+    u8 *hash_location;
     struct inet_request_sock *ireq;
     struct tcp_request_sock *treq;
     struct tcp_sock *tp = tcp_sk(sk);
@@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
     int mss;
     struct rtable *rt;
     __u8 rcv_wscale;
-    struct tcp_options_received tcp_opt;
 
     if (!sysctl_tcp_syncookies || !th->ack)
         goto out;
@@ -276,13 +277,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-    /* check for timestamp cookie support */
-    memset(&tcp_opt, 0, sizeof(tcp_opt));
-    tcp_parse_options(skb, &tcp_opt, 0);
-
-    if (tcp_opt.saw_tstamp)
-        cookie_check_timestamp(&tcp_opt);
-
     ret = NULL;
     req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
     if (!req)
@@ -298,12 +292,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
     ireq->loc_addr = ip_hdr(skb)->daddr;
     ireq->rmt_addr = ip_hdr(skb)->saddr;
     ireq->ecn_ok = 0;
-    ireq->snd_wscale = tcp_opt.snd_wscale;
-    ireq->rcv_wscale = tcp_opt.rcv_wscale;
-    ireq->sack_ok = tcp_opt.sack_ok;
-    ireq->wscale_ok = tcp_opt.wscale_ok;
-    ireq->tstamp_ok = tcp_opt.saw_tstamp;
-    req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 
     /* We throwed the options of the initial SYN away, so we hope
      * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -333,7 +321,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
      * no easy way to do this.
      */
     {
-        struct flowi fl = { .nl_u = { .ip4_u =
+        struct flowi fl = { .mark = sk->sk_mark,
+                            .nl_u = { .ip4_u =
                             { .daddr = ((opt && opt->srr) ?
                                         opt->faddr :
                                         ireq->rmt_addr),
@@ -351,6 +340,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
     }
     }
 
+    /* check for timestamp cookie support */
+    memset(&tcp_opt, 0, sizeof(tcp_opt));
+    tcp_parse_options(skb, &tcp_opt, &hash_location, 0, &rt->u.dst);
+
+    if (tcp_opt.saw_tstamp)
+        cookie_check_timestamp(&tcp_opt);
+
+    ireq->snd_wscale = tcp_opt.snd_wscale;
+    ireq->rcv_wscale = tcp_opt.rcv_wscale;
+    ireq->sack_ok = tcp_opt.sack_ok;
+    ireq->wscale_ok = tcp_opt.wscale_ok;
+    ireq->tstamp_ok = tcp_opt.saw_tstamp;
+    req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
     /* Try to redo what tcp_v4_send_synack did. */
     req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 300056732953..7e3712ce3994 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -569,6 +569,13 @@ static struct ctl_table ipv4_table[] = {
         .proc_handler = proc_dointvec,
     },
     {
+        .procname     = "tcp_cookie_size",
+        .data         = &sysctl_tcp_cookie_size,
+        .maxlen       = sizeof(int),
+        .mode         = 0644,
+        .proc_handler = proc_dointvec
+    },
+    {
         .procname     = "udp_mem",
         .data         = &sysctl_udp_mem,
         .maxlen       = sizeof(sysctl_udp_mem),
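Since ipv4_table is the table registered under net.ipv4, the new entry surfaces sysctl_tcp_cookie_size as /proc/sys/net/ipv4/tcp_cookie_size with mode 0644 and a plain proc_dointvec handler — an ordinary integer knob. A trivial userspace sketch of reading it (the path is inferred from the table registration, not spelled out in this hunk):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* proc_dointvec makes this a plain decimal integer file. */
    FILE *f = fopen("/proc/sys/net/ipv4/tcp_cookie_size", "r");
    int val;

    if (!f || fscanf(f, "%d", &val) != 1) {
        perror("tcp_cookie_size");
        return EXIT_FAILURE;
    }
    printf("tcp_cookie_size = %d\n", val);
    fclose(f);
    return 0;
}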
@@ -660,7 +667,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
     struct ctl_table *table;
 
     table = ipv4_net_table;
-    if (net != &init_net) {
+    if (!net_eq(net, &init_net)) {
         table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
         if (table == NULL)
             goto err_alloc;
@@ -691,7 +698,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
     return 0;
 
 err_reg:
-    if (net != &init_net)
+    if (!net_eq(net, &init_net))
         kfree(table);
 err_alloc:
     return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc71088..c8666b70cde0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/crypto.h>
+#include <linux/time.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -2042,7 +2043,7 @@ int tcp_disconnect(struct sock *sk, int flags)
     __skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
-    inet->dport = 0;
+    inet->inet_dport = 0;
 
     if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
         inet_reset_saddr(sk);
@@ -2059,6 +2060,7 @@ int tcp_disconnect(struct sock *sk, int flags)
     tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
     tp->snd_cwnd_cnt = 0;
     tp->bytes_acked = 0;
+    tp->window_clamp = 0;
     tcp_set_ca_state(sk, TCP_CA_Open);
     tcp_clear_retrans(tp);
     inet_csk_delack_init(sk);
@@ -2066,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
     memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
     __sk_dst_reset(sk);
 
-    WARN_ON(inet->num && !icsk->icsk_bind_hash);
+    WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
     sk->sk_error_report(sk);
     return err;
@@ -2083,8 +2085,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
     int val;
     int err = 0;
 
-    /* This is a string value all the others are int's */
-    if (optname == TCP_CONGESTION) {
+    /* These are data/string values, all the others are ints */
+    switch (optname) {
+    case TCP_CONGESTION: {
         char name[TCP_CA_NAME_MAX];
 
         if (optlen < 1)
@@ -2101,6 +2104,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
         release_sock(sk);
         return err;
     }
+    case TCP_COOKIE_TRANSACTIONS: {
+        struct tcp_cookie_transactions ctd;
+        struct tcp_cookie_values *cvp = NULL;
+
+        if (sizeof(ctd) > optlen)
+            return -EINVAL;
+        if (copy_from_user(&ctd, optval, sizeof(ctd)))
+            return -EFAULT;
+
+        if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+            ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+            return -EINVAL;
+
+        if (ctd.tcpct_cookie_desired == 0) {
+            /* default to global value */
+        } else if ((0x1 & ctd.tcpct_cookie_desired) ||
+                   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+                   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+            return -EINVAL;
+        }
+
+        if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+            /* Supercedes all other values */
+            lock_sock(sk);
+            if (tp->cookie_values != NULL) {
+                kref_put(&tp->cookie_values->kref,
+                         tcp_cookie_values_release);
+                tp->cookie_values = NULL;
+            }
+            tp->rx_opt.cookie_in_always = 0; /* false */
+            tp->rx_opt.cookie_out_never = 1; /* true */
+            release_sock(sk);
+            return err;
+        }
+
+        /* Allocate ancillary memory before locking.
+         */
+        if (ctd.tcpct_used > 0 ||
+            (tp->cookie_values == NULL &&
+             (sysctl_tcp_cookie_size > 0 ||
+              ctd.tcpct_cookie_desired > 0 ||
+              ctd.tcpct_s_data_desired > 0))) {
+            cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+                          GFP_KERNEL);
+            if (cvp == NULL)
+                return -ENOMEM;
+        }
+        lock_sock(sk);
+        tp->rx_opt.cookie_in_always =
+            (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+        tp->rx_opt.cookie_out_never = 0; /* false */
+
+        if (tp->cookie_values != NULL) {
+            if (cvp != NULL) {
+                /* Changed values are recorded by a changed
+                 * pointer, ensuring the cookie will differ,
+                 * without separately hashing each value later.
+                 */
+                kref_put(&tp->cookie_values->kref,
+                         tcp_cookie_values_release);
+                kref_init(&cvp->kref);
+                tp->cookie_values = cvp;
+            } else {
+                cvp = tp->cookie_values;
+            }
+        }
+        if (cvp != NULL) {
+            cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+            if (ctd.tcpct_used > 0) {
+                memcpy(cvp->s_data_payload, ctd.tcpct_value,
+                       ctd.tcpct_used);
+                cvp->s_data_desired = ctd.tcpct_used;
+                cvp->s_data_constant = 1; /* true */
+            } else {
+                /* No constant payload data. */
+                cvp->s_data_desired = ctd.tcpct_s_data_desired;
+                cvp->s_data_constant = 0; /* false */
+            }
+        }
+        release_sock(sk);
+        return err;
+    }
+    default:
+        /* fallthru */
+        break;
+    };
 
     if (optlen < sizeof(int))
         return -EINVAL;
@@ -2425,6 +2515,47 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
         if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
             return -EFAULT;
         return 0;
+
+    case TCP_COOKIE_TRANSACTIONS: {
+        struct tcp_cookie_transactions ctd;
+        struct tcp_cookie_values *cvp = tp->cookie_values;
+
+        if (get_user(len, optlen))
+            return -EFAULT;
+        if (len < sizeof(ctd))
+            return -EINVAL;
+
+        memset(&ctd, 0, sizeof(ctd));
+        ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+                           TCP_COOKIE_IN_ALWAYS : 0)
+                        | (tp->rx_opt.cookie_out_never ?
+                           TCP_COOKIE_OUT_NEVER : 0);
+
+        if (cvp != NULL) {
+            ctd.tcpct_flags |= (cvp->s_data_in ?
+                                TCP_S_DATA_IN : 0)
+                             | (cvp->s_data_out ?
+                                TCP_S_DATA_OUT : 0);
+
+            ctd.tcpct_cookie_desired = cvp->cookie_desired;
+            ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+            /* Cookie(s) saved, return as nonce */
+            if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) {
+                /* impossible? */
+                return -EINVAL;
+            }
+            memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+                   cvp->cookie_pair_size);
+            ctd.tcpct_used = cvp->cookie_pair_size;
+        }
+
+        if (put_user(sizeof(ctd), optlen))
+            return -EFAULT;
+        if (copy_to_user(optval, &ctd, sizeof(ctd)))
+            return -EFAULT;
+        return 0;
+    }
     default:
         return -ENOPROTOOPT;
     }
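Taken together, the two new cases give userspace a set/get interface for cookie transactions. A hedged sketch of how a client might use it — struct tcp_cookie_transactions and the TCP_COOKIE_TRANSACTIONS option constant are defined in include/linux/tcp.h elsewhere in this series, so the sketch assumes those definitions and a kernel carrying this patch:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>    /* TCP_COOKIE_TRANSACTIONS, struct tcp_cookie_transactions */

int main(void)
{
    struct tcp_cookie_transactions ctd;
    socklen_t len = sizeof(ctd);
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0)
        return 1;

    /* Request 8-byte cookies; per the checks above the value must be
     * even and within [TCP_COOKIE_MIN, TCP_COOKIE_MAX], or 0 to defer
     * to the tcp_cookie_size sysctl. */
    memset(&ctd, 0, sizeof(ctd));
    ctd.tcpct_cookie_desired = 8;
    if (setsockopt(fd, IPPROTO_TCP, TCP_COOKIE_TRANSACTIONS,
                   &ctd, sizeof(ctd)) < 0)
        perror("setsockopt(TCP_COOKIE_TRANSACTIONS)");

    /* After a connect(), the same option reports flags and any saved
     * cookie pair as a nonce, per the getsockopt case above. */
    if (getsockopt(fd, IPPROTO_TCP, TCP_COOKIE_TRANSACTIONS,
                   &ctd, &len) == 0)
        printf("flags=%#x used=%u\n", ctd.tcpct_flags, ctd.tcpct_used);
    return 0;
}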
@@ -2847,6 +2978,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover.  Each secret value has 4 states:
+ *
+ * Generating.  (tcp_secret_generating != tcp_secret_primary)
+ *    Generates new Responder-Cookies, but not yet used for primary
+ *    verification.  This is a short-term state, typically lasting only
+ *    one round trip time (RTT).
+ *
+ * Primary.  (tcp_secret_generating == tcp_secret_primary)
+ *    Used both for generation and primary verification.
+ *
+ * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
+ *    Used for verification, until the first failure that can be
+ *    verified by the newer Generating secret.  At that time, this
+ *    cookie's state is changed to Secondary, and the Generating
+ *    cookie's state is changed to Primary.  This is a short-term state,
+ *    typically lasting only one round trip time (RTT).
+ *
+ * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
+ *    Used for secondary verification, after primary verification
+ *    failures.  This state lasts no more than twice the Maximum Segment
+ *    Lifetime (2MSL).  Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+    /* The secret is divided into two parts.  The digest part is the
+     * equivalent of previously hashing a secret and saving the state,
+     * and serves as an initialization vector (IV).  The message part
+     * serves as the trailing secret.
+     */
+    u32 secrets[COOKIE_WORKSPACE_WORDS];
+    unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+    return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+    unsigned long jiffy = jiffies;
+
+    if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+        spin_lock_bh(&tcp_secret_locker);
+        if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+            /* refreshed by another */
+            memcpy(bakery,
+                   &tcp_secret_generating->secrets[0],
+                   COOKIE_WORKSPACE_WORDS);
+        } else {
+            /* still needs refreshing */
+            get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+            /* The first time, paranoia assumes that the
+             * randomization function isn't as strong.  But,
+             * this secret initialization is delayed until
+             * the last possible moment (packet arrival).
+             * Although that time is observable, it is
+             * unpredictably variable.  Mash in the most
+             * volatile clock bits available, and expire the
+             * secret extra quickly.
+             */
+            if (unlikely(tcp_secret_primary->expires ==
+                         tcp_secret_secondary->expires)) {
+                struct timespec tv;
+
+                getnstimeofday(&tv);
+                bakery[COOKIE_DIGEST_WORDS+0] ^=
+                    (u32)tv.tv_nsec;
+
+                tcp_secret_secondary->expires = jiffy
+                    + TCP_SECRET_1MSL
+                    + (0x0f & tcp_cookie_work(bakery, 0));
+            } else {
+                tcp_secret_secondary->expires = jiffy
+                    + TCP_SECRET_LIFE
+                    + (0xff & tcp_cookie_work(bakery, 1));
+                tcp_secret_primary->expires = jiffy
+                    + TCP_SECRET_2MSL
+                    + (0x1f & tcp_cookie_work(bakery, 2));
+            }
+            memcpy(&tcp_secret_secondary->secrets[0],
+                   bakery, COOKIE_WORKSPACE_WORDS);
+
+            rcu_assign_pointer(tcp_secret_generating,
+                               tcp_secret_secondary);
+            rcu_assign_pointer(tcp_secret_retiring,
+                               tcp_secret_primary);
+            /*
+             * Neither call_rcu() nor synchronize_rcu() needed.
+             * Retiring data is not freed.  It is replaced after
+             * further (locked) pointer updates, and a quiet time
+             * (minimum 1MSL, maximum LIFE - 2MSL).
+             */
+        }
+        spin_unlock_bh(&tcp_secret_locker);
+    } else {
+        rcu_read_lock_bh();
+        memcpy(bakery,
+               &rcu_dereference(tcp_secret_generating)->secrets[0],
+               COOKIE_WORKSPACE_WORDS);
+        rcu_read_unlock_bh();
+    }
+    return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
 void tcp_done(struct sock *sk)
 {
     if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
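The comment block introduced above describes a lock-free read path with a locked writer that rotates two statically allocated secrets. The essential concurrency pattern, reduced to a hedged standalone sketch (all names below are invented for illustration): readers fetch the published pointer under rcu_read_lock_bh(), while the refresher takes a spinlock, refills the buffer that is not currently published, and swaps it in with rcu_assign_pointer(). No call_rcu() or synchronize_rcu() is needed because neither buffer is ever freed; the retired one is only rewritten after a further locked swap.

#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/jiffies.h>
#include <linux/string.h>

struct demo_secret {
    u32 words[16];
    unsigned long expires;
};

static struct demo_secret demo_a, demo_b;
static struct demo_secret *demo_current = &demo_a;    /* published pointer */
static DEFINE_SPINLOCK(demo_lock);

/* Reader: safe in softirq context, never blocks on the writer. */
static void demo_read(u32 *out)
{
    rcu_read_lock_bh();
    memcpy(out, rcu_dereference(demo_current)->words,
           sizeof(demo_a.words));
    rcu_read_unlock_bh();
}

/* Writer: refill the buffer that is *not* published, then publish it. */
static void demo_refresh(void)
{
    struct demo_secret *next;

    spin_lock_bh(&demo_lock);
    next = (demo_current == &demo_a) ? &demo_b : &demo_a;
    get_random_bytes(next->words, sizeof(next->words));
    next->expires = jiffies + 600 * HZ;
    rcu_assign_pointer(demo_current, next);
    /* No call_rcu()/synchronize_rcu(): the displaced buffer is static
     * and is only rewritten after another locked swap, giving readers
     * a quiet period, mirroring the tcp_secret rollover above. */
    spin_unlock_bh(&demo_lock);
}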
@@ -2881,6 +3141,7 @@ void __init tcp_init(void)
     struct sk_buff *skb = NULL;
     unsigned long nr_pages, limit;
     int order, i, max_share;
+    unsigned long jiffy = jiffies;
 
     BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
@@ -2903,11 +3164,10 @@ void __init tcp_init(void)
                     (totalram_pages >= 128 * 1024) ?
                     13 : 15,
                     0,
-                    &tcp_hashinfo.ehash_size,
                     NULL,
+                    &tcp_hashinfo.ehash_mask,
                     thash_entries ? 0 : 512 * 1024);
-    tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
-    for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+    for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
         INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
         INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
     }
@@ -2916,7 +3176,7 @@ void __init tcp_init(void)
     tcp_hashinfo.bhash =
         alloc_large_system_hash("TCP bind",
                     sizeof(struct inet_bind_hashbucket),
-                    tcp_hashinfo.ehash_size,
+                    tcp_hashinfo.ehash_mask + 1,
                     (totalram_pages >= 128 * 1024) ?
                     13 : 15,
                     0,
@@ -2971,10 +3231,19 @@ void __init tcp_init(void)
     sysctl_tcp_rmem[2] = max(87380, max_share);
 
     printk(KERN_INFO "TCP: Hash tables configured "
-           "(established %d bind %d)\n",
-           tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
+           "(established %u bind %u)\n",
+           tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
     tcp_register_congestion_control(&tcp_reno);
+
+    memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+    memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+    tcp_secret_one.expires = jiffy; /* past due */
+    tcp_secret_two.expires = jiffy; /* past due */
+    tcp_secret_generating = &tcp_secret_one;
+    tcp_secret_primary = &tcp_secret_one;
+    tcp_secret_retiring = &tcp_secret_two;
+    tcp_secret_secondary = &tcp_secret_two;
 }
 
 EXPORT_SYMBOL(tcp_close);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index fcbcd4ff6c5f..939edb3b8e4d 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
         r->idiag_rqueue = sk->sk_ack_backlog;
         r->idiag_wqueue = sk->sk_max_ack_backlog;
     } else {
-        r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+        r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
         r->idiag_wqueue = tp->write_seq - tp->snd_una;
     }
     if (info != NULL)
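The idiag_rqueue fix guards a lockless snapshot: if copied_seq is momentarily sampled ahead of rcv_nxt while the socket is being updated, the u32 subtraction wraps to a huge positive value, so the result is clamped through a signed max_t(). A tiny standalone illustration of the failure mode and the fix:

#include <stdio.h>
#include <stdint.h>

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
    /* Racy snapshot: copied_seq momentarily ahead of rcv_nxt. */
    uint32_t rcv_nxt = 1000, copied_seq = 1003;

    uint32_t raw = rcv_nxt - copied_seq;               /* wraps: 4294967293 */
    int clamped = max_t(int, rcv_nxt - copied_seq, 0); /* 0, as intended */

    printf("raw=%u clamped=%d\n", raw, clamped);
    return 0;
}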
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 26d5c7fc7de5..7c94a4955416 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
     if (icsk->icsk_ca_state == TCP_CA_Open) {
         if (ca->maxRTT < ca->minRTT)
             ca->maxRTT = ca->minRTT;
-        if (ca->maxRTT < srtt
-            && srtt <= ca->maxRTT + msecs_to_jiffies(20))
+        if (ca->maxRTT < srtt &&
+            srtt <= ca->maxRTT + msecs_to_jiffies(20))
             ca->maxRTT = srtt;
     }
 }
@@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
 
     ca->packetcount += pkts_acked;
 
-    if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1)
-        && now - ca->lasttime >= ca->minRTT
-        && ca->minRTT > 0) {
+    if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+        now - ca->lasttime >= ca->minRTT &&
+        ca->minRTT > 0) {
         __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
 
         if (htcp_ccount(ca) <= 3) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784be7ab3..57ae96a04220 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
140 * "len" is invariant segment length, including TCP header. 140 * "len" is invariant segment length, including TCP header.
141 */ 141 */
142 len += skb->data - skb_transport_header(skb); 142 len += skb->data - skb_transport_header(skb);
143 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || 143 if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
144 /* If PSH is not set, packet should be 144 /* If PSH is not set, packet should be
145 * full sized, provided peer TCP is not badly broken. 145 * full sized, provided peer TCP is not badly broken.
146 * This observation (if it is correct 8)) allows 146 * This observation (if it is correct 8)) allows
@@ -411,7 +411,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
411 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 411 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
412 412
413 hint = min(hint, tp->rcv_wnd / 2); 413 hint = min(hint, tp->rcv_wnd / 2);
414 hint = min(hint, TCP_MIN_RCVMSS); 414 hint = min(hint, TCP_MSS_DEFAULT);
415 hint = max(hint, TCP_MIN_MSS); 415 hint = max(hint, TCP_MIN_MSS);
416 416
417 inet_csk(sk)->icsk_ack.rcv_mss = hint; 417 inet_csk(sk)->icsk_ack.rcv_mss = hint;
@@ -2300,7 +2300,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
2300 * they differ. Since neither occurs due to loss, TCP should really 2300 * they differ. Since neither occurs due to loss, TCP should really
2301 * ignore them. 2301 * ignore them.
2302 */ 2302 */
2303static inline int tcp_dupack_heurestics(struct tcp_sock *tp) 2303static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2304{ 2304{
2305 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2305 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2306} 2306}
@@ -2425,7 +2425,7 @@ static int tcp_time_to_recover(struct sock *sk)
2425 return 1; 2425 return 1;
2426 2426
2427 /* Not-A-Trick#2 : Classic rule... */ 2427 /* Not-A-Trick#2 : Classic rule... */
2428 if (tcp_dupack_heurestics(tp) > tp->reordering) 2428 if (tcp_dupack_heuristics(tp) > tp->reordering)
2429 return 1; 2429 return 1;
2430 2430
2431 /* Trick#3 : when we use RFC2988 timer restart, fast 2431 /* Trick#3 : when we use RFC2988 timer restart, fast
@@ -3698,7 +3698,7 @@ old_ack:
3698 * the fast version below fails. 3698 * the fast version below fails.
3699 */ 3699 */
3700void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, 3700void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3701 int estab) 3701 u8 **hvpp, int estab, struct dst_entry *dst)
3702{ 3702{
3703 unsigned char *ptr; 3703 unsigned char *ptr;
3704 struct tcphdr *th = tcp_hdr(skb); 3704 struct tcphdr *th = tcp_hdr(skb);
@@ -3737,7 +3737,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3737 break; 3737 break;
3738 case TCPOPT_WINDOW: 3738 case TCPOPT_WINDOW:
3739 if (opsize == TCPOLEN_WINDOW && th->syn && 3739 if (opsize == TCPOLEN_WINDOW && th->syn &&
3740 !estab && sysctl_tcp_window_scaling) { 3740 !estab && sysctl_tcp_window_scaling &&
3741 !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
3741 __u8 snd_wscale = *(__u8 *)ptr; 3742 __u8 snd_wscale = *(__u8 *)ptr;
3742 opt_rx->wscale_ok = 1; 3743 opt_rx->wscale_ok = 1;
3743 if (snd_wscale > 14) { 3744 if (snd_wscale > 14) {
@@ -3753,7 +3754,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3753 case TCPOPT_TIMESTAMP: 3754 case TCPOPT_TIMESTAMP:
3754 if ((opsize == TCPOLEN_TIMESTAMP) && 3755 if ((opsize == TCPOLEN_TIMESTAMP) &&
3755 ((estab && opt_rx->tstamp_ok) || 3756 ((estab && opt_rx->tstamp_ok) ||
3756 (!estab && sysctl_tcp_timestamps))) { 3757 (!estab && sysctl_tcp_timestamps &&
3758 !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
3757 opt_rx->saw_tstamp = 1; 3759 opt_rx->saw_tstamp = 1;
3758 opt_rx->rcv_tsval = get_unaligned_be32(ptr); 3760 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
3759 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); 3761 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
@@ -3761,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3761 break; 3763 break;
3762 case TCPOPT_SACK_PERM: 3764 case TCPOPT_SACK_PERM:
3763 if (opsize == TCPOLEN_SACK_PERM && th->syn && 3765 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3764 !estab && sysctl_tcp_sack) { 3766 !estab && sysctl_tcp_sack &&
3767 !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
3765 opt_rx->sack_ok = 1; 3768 opt_rx->sack_ok = 1;
3766 tcp_sack_reset(opt_rx); 3769 tcp_sack_reset(opt_rx);
3767 } 3770 }
@@ -3782,7 +3785,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3782 */ 3785 */
3783 break; 3786 break;
3784#endif 3787#endif
3785 } 3788 case TCPOPT_COOKIE:
3789 /* This option is variable length.
3790 */
3791 switch (opsize) {
3792 case TCPOLEN_COOKIE_BASE:
3793 /* not yet implemented */
3794 break;
3795 case TCPOLEN_COOKIE_PAIR:
3796 /* not yet implemented */
3797 break;
3798 case TCPOLEN_COOKIE_MIN+0:
3799 case TCPOLEN_COOKIE_MIN+2:
3800 case TCPOLEN_COOKIE_MIN+4:
3801 case TCPOLEN_COOKIE_MIN+6:
3802 case TCPOLEN_COOKIE_MAX:
3803 /* 16-bit multiple */
3804 opt_rx->cookie_plus = opsize;
3805 *hvpp = ptr;
3806 default:
3807 /* ignore option */
3808 break;
3809 };
3810 break;
3811 };
3786 3812
3787 ptr += opsize-2; 3813 ptr += opsize-2;
3788 length -= opsize; 3814 length -= opsize;
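The new TCPOPT_COOKIE case slots into the standard TCP option TLV walk: after the fixed header, each option is either a single-byte kind (EOL, NOP) or a kind/length/value triple whose length byte covers both the kind and length bytes themselves. A self-contained userspace sketch of the same walking discipline over a raw options buffer (the constants mirror the TCP wire encoding; everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL 0
#define TCPOPT_NOP 1

/* Walk TCP options the way tcp_parse_options() does: single-byte
 * EOL/NOP first, then kind/opsize pairs where opsize includes both
 * header bytes. */
static void walk_options(const uint8_t *ptr, int length)
{
    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return;
        case TCPOPT_NOP:    /* one byte of padding */
            length--;
            continue;
        default:
            opsize = *ptr++;
            if (opsize < 2)         /* "silly options" */
                return;
            if (opsize > length)    /* don't parse partial options */
                return;
            printf("kind=%d len=%d\n", opcode, opsize);
            ptr += opsize - 2;
            length -= opsize;
        }
    }
}

int main(void)
{
    /* NOP, NOP, MSS (kind 2, len 4, value 1460), EOL */
    const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4, 0 };

    walk_options(opts, sizeof(opts));
    return 0;
}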
@@ -3810,17 +3836,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
  * If it is wrong it falls back on tcp_parse_options().
  */
 static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
-                                  struct tcp_sock *tp)
+                                  struct tcp_sock *tp, u8 **hvpp)
 {
-    if (th->doff == sizeof(struct tcphdr) >> 2) {
+    /* In the spirit of fast parsing, compare doff directly to constant
+     * values.  Because equality is used, short doff can be ignored here.
+     */
+    if (th->doff == (sizeof(*th) / 4)) {
         tp->rx_opt.saw_tstamp = 0;
         return 0;
     } else if (tp->rx_opt.tstamp_ok &&
-               th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+               th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
         if (tcp_parse_aligned_timestamp(tp, th))
             return 1;
     }
-    tcp_parse_options(skb, &tp->rx_opt, 1);
+    tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
     return 1;
 }
 
@@ -4075,8 +4104,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
     struct tcp_sock *tp = tcp_sk(sk);
+    struct dst_entry *dst = __sk_dst_get(sk);
 
-    if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+    if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+        !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
         int mib_idx;
 
         if (before(seq, tp->rcv_nxt))
@@ -4105,13 +4136,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
     struct tcp_sock *tp = tcp_sk(sk);
+    struct dst_entry *dst = __sk_dst_get(sk);
 
     if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
         before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
         tcp_enter_quickack_mode(sk);
 
-        if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+        if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+            !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
             u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
             if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
@@ -4845,11 +4878,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
     struct tcp_sock *tp = tcp_sk(sk);
 
         /* More than one full frame received... */
-    if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
+    if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
          /* ... and right edge of window advances far enough.
           * (tcp_recvmsg() will send ACK otherwise). Or...
           */
-         && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+         __tcp_select_window(sk) >= tp->rcv_wnd) ||
         /* We ACK each frame or... */
         tcp_in_quickack_mode(sk) ||
         /* We have out of order data. */
@@ -5070,10 +5103,12 @@ out:
 static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                                  struct tcphdr *th, int syn_inerr)
 {
+    u8 *hash_location;
     struct tcp_sock *tp = tcp_sk(sk);
 
     /* RFC1323: H1. Apply PAWS check first. */
-    if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+    if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
+        tp->rx_opt.saw_tstamp &&
         tcp_paws_discard(sk, skb)) {
         if (!th->rst) {
             NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5361,11 +5396,14 @@ discard:
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                          struct tcphdr *th, unsigned len)
 {
-    struct tcp_sock *tp = tcp_sk(sk);
+    u8 *hash_location;
     struct inet_connection_sock *icsk = inet_csk(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    struct dst_entry *dst = __sk_dst_get(sk);
+    struct tcp_cookie_values *cvp = tp->cookie_values;
     int saved_clamp = tp->rx_opt.mss_clamp;
 
-    tcp_parse_options(skb, &tp->rx_opt, 0);
+    tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, dst);
 
     if (th->ack) {
         /* rfc793:
@@ -5462,6 +5500,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
          * Change state from SYN-SENT only after copied_seq
          * is initialized. */
         tp->copied_seq = tp->rcv_nxt;
+
+        if (cvp != NULL &&
+            cvp->cookie_pair_size > 0 &&
+            tp->rx_opt.cookie_plus > 0) {
+            int cookie_size = tp->rx_opt.cookie_plus
+                            - TCPOLEN_COOKIE_BASE;
+            int cookie_pair_size = cookie_size
+                                 + cvp->cookie_desired;
+
+            /* A cookie extension option was sent and returned.
+             * Note that each incoming SYNACK replaces the
+             * Responder cookie.  The initial exchange is most
+             * fragile, as protection against spoofing relies
+             * entirely upon the sequence and timestamp (above).
+             * This replacement strategy allows the correct pair to
+             * pass through, while any others will be filtered via
+             * Responder verification later.
+             */
+            if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
+                memcpy(&cvp->cookie_pair[cvp->cookie_desired],
+                       hash_location, cookie_size);
+                cvp->cookie_pair_size = cookie_pair_size;
+            }
+        }
+
         smp_mb();
         tcp_set_state(sk, TCP_ESTABLISHED);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b53f61..29002ab26e0d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -165,10 +165,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
         nexthop = inet->opt->faddr;
     }
 
-    tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+    tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
                            RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                            IPPROTO_TCP,
-                           inet->sport, usin->sin_port, sk, 1);
+                           inet->inet_sport, usin->sin_port, sk, 1);
     if (tmp < 0) {
         if (tmp == -ENETUNREACH)
             IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -183,11 +183,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
     if (!inet->opt || !inet->opt->srr)
         daddr = rt->rt_dst;
 
-    if (!inet->saddr)
-        inet->saddr = rt->rt_src;
-    inet->rcv_saddr = inet->saddr;
+    if (!inet->inet_saddr)
+        inet->inet_saddr = rt->rt_src;
+    inet->inet_rcv_saddr = inet->inet_saddr;
 
-    if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+    if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
         /* Reset inherited state */
         tp->rx_opt.ts_recent = 0;
         tp->rx_opt.ts_recent_stamp = 0;
@@ -204,20 +204,20 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
      * when trying new connection.
      */
     if (peer != NULL &&
-        peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
+        (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
         tp->rx_opt.ts_recent = peer->tcp_ts;
     }
     }
 
-    inet->dport = usin->sin_port;
-    inet->daddr = daddr;
+    inet->inet_dport = usin->sin_port;
+    inet->inet_daddr = daddr;
 
     inet_csk(sk)->icsk_ext_hdr_len = 0;
     if (inet->opt)
         inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
-    tp->rx_opt.mss_clamp = 536;
+    tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 
     /* Socket identity is still unknown (sport may be zero).
      * However we set state to SYN-SENT and not releasing socket
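The rewritten PAWS reuse test computes the age of the cached peer timestamp with an unsigned subtraction instead of comparing stamp + TCP_PAWS_MSL against the current time; the subtractive form stays well-behaved for any stamp value, whereas the additive form happily accepts a stamp that lies in the future (e.g. after peer data corruption or a clock step). A small standalone illustration of the boundary behavior (values chosen for the demo, not from the patch):

#include <stdio.h>
#include <stdint.h>

#define TCP_PAWS_MSL 60    /* seconds, matching the kernel constant */

int main(void)
{
    /* A cached peer stamp that somehow lies in the future. */
    uint32_t stamp = 1000;
    uint32_t now = 900;

    /* Additive form: 1060 >= 900, so the bogus entry is accepted. */
    int old_ok = stamp + TCP_PAWS_MSL >= now;

    /* Subtractive form: 900 - 1000 wraps to ~4.29e9 > 60, rejected. */
    int new_ok = now - stamp <= TCP_PAWS_MSL;

    printf("old accepts: %d, new accepts: %d\n", old_ok, new_ok);
    return 0;
}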
@@ -230,7 +230,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
         goto failure;
 
     err = ip_route_newports(&rt, IPPROTO_TCP,
-                            inet->sport, inet->dport, sk);
+                            inet->inet_sport, inet->inet_dport, sk);
     if (err)
         goto failure;
 
@@ -239,12 +239,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
     sk_setup_caps(sk, &rt->u.dst);
 
     if (!tp->write_seq)
-        tp->write_seq = secure_tcp_sequence_number(inet->saddr,
-                                                   inet->daddr,
-                                                   inet->sport,
+        tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                                    inet->inet_daddr,
+                                                   inet->inet_sport,
                                                    usin->sin_port);
 
-    inet->id = tp->write_seq ^ jiffies;
+    inet->inet_id = tp->write_seq ^ jiffies;
 
     err = tcp_connect(sk);
     rt = NULL;
@@ -261,7 +261,7 @@ failure:
     tcp_set_state(sk, TCP_CLOSE);
     ip_rt_put(rt);
     sk->sk_route_caps = 0;
-    inet->dport = 0;
+    inet->inet_dport = 0;
     return err;
 }
 
@@ -520,12 +520,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
     struct tcphdr *th = tcp_hdr(skb);
 
     if (skb->ip_summed == CHECKSUM_PARTIAL) {
-        th->check = ~tcp_v4_check(len, inet->saddr,
-                                  inet->daddr, 0);
+        th->check = ~tcp_v4_check(len, inet->inet_saddr,
+                                  inet->inet_daddr, 0);
         skb->csum_start = skb_transport_header(skb) - skb->head;
         skb->csum_offset = offsetof(struct tcphdr, check);
     } else {
-        th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
+        th->check = tcp_v4_check(len, inet->inet_saddr,
+                                 inet->inet_daddr,
                                  csum_partial(th,
                                               th->doff << 2,
                                               skb->csum));
@@ -741,8 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  * This still operates on a request_sock only, not on a big
  * socket.
  */
-static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
-                                struct dst_entry *dst)
+static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+                                struct request_sock *req,
+                                struct request_values *rvp)
 {
     const struct inet_request_sock *ireq = inet_rsk(req);
     int err = -1;
@@ -752,7 +754,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
     if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
         return -1;
 
-    skb = tcp_make_synack(sk, dst, req);
+    skb = tcp_make_synack(sk, dst, req, rvp);
 
     if (skb) {
         struct tcphdr *th = tcp_hdr(skb);
@@ -773,9 +775,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
     return err;
 }
 
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
+static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
+                              struct request_values *rvp)
 {
-    return __tcp_v4_send_synack(sk, req, NULL);
+    return __tcp_v4_send_synack(sk, NULL, req, rvp);
 }
 
 /*
@@ -848,7 +851,7 @@ static struct tcp_md5sig_key *
 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
                                          struct sock *addr_sk)
 {
-    return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
+    return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 }
 
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
@@ -923,7 +926,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add);
923static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, 926static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
924 u8 *newkey, u8 newkeylen) 927 u8 *newkey, u8 newkeylen)
925{ 928{
926 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, 929 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
927 newkey, newkeylen); 930 newkey, newkeylen);
928} 931}
929 932
@@ -1089,8 +1092,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1089 __be32 saddr, daddr; 1092 __be32 saddr, daddr;
1090 1093
1091 if (sk) { 1094 if (sk) {
1092 saddr = inet_sk(sk)->saddr; 1095 saddr = inet_sk(sk)->inet_saddr;
1093 daddr = inet_sk(sk)->daddr; 1096 daddr = inet_sk(sk)->inet_daddr;
1094 } else if (req) { 1097 } else if (req) {
1095 saddr = inet_rsk(req)->loc_addr; 1098 saddr = inet_rsk(req)->loc_addr;
1096 daddr = inet_rsk(req)->rmt_addr; 1099 daddr = inet_rsk(req)->rmt_addr;
@@ -1210,13 +1213,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
1210 1213
1211int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1214int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1212{ 1215{
1213 struct inet_request_sock *ireq; 1216 struct tcp_extend_values tmp_ext;
1214 struct tcp_options_received tmp_opt; 1217 struct tcp_options_received tmp_opt;
1218 u8 *hash_location;
1215 struct request_sock *req; 1219 struct request_sock *req;
1220 struct inet_request_sock *ireq;
1221 struct tcp_sock *tp = tcp_sk(sk);
1222 struct dst_entry *dst = NULL;
1216 __be32 saddr = ip_hdr(skb)->saddr; 1223 __be32 saddr = ip_hdr(skb)->saddr;
1217 __be32 daddr = ip_hdr(skb)->daddr; 1224 __be32 daddr = ip_hdr(skb)->daddr;
1218 __u32 isn = TCP_SKB_CB(skb)->when; 1225 __u32 isn = TCP_SKB_CB(skb)->when;
1219 struct dst_entry *dst = NULL;
1220#ifdef CONFIG_SYN_COOKIES 1226#ifdef CONFIG_SYN_COOKIES
1221 int want_cookie = 0; 1227 int want_cookie = 0;
1222#else 1228#else
@@ -1256,27 +1262,65 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1256 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; 1262 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1257#endif 1263#endif
1258 1264
1265 ireq = inet_rsk(req);
1266 ireq->loc_addr = daddr;
1267 ireq->rmt_addr = saddr;
1268 ireq->no_srccheck = inet_sk(sk)->transparent;
1269 ireq->opt = tcp_v4_save_options(sk, skb);
1270
1271 dst = inet_csk_route_req(sk, req);
 1272 if (!dst)
1273 goto drop_and_free;
1274
1259 tcp_clear_options(&tmp_opt); 1275 tcp_clear_options(&tmp_opt);
1260 tmp_opt.mss_clamp = 536; 1276 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1261 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; 1277 tmp_opt.user_mss = tp->rx_opt.user_mss;
1278 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst);
1279
1280 if (tmp_opt.cookie_plus > 0 &&
1281 tmp_opt.saw_tstamp &&
1282 !tp->rx_opt.cookie_out_never &&
1283 (sysctl_tcp_cookie_size > 0 ||
1284 (tp->cookie_values != NULL &&
1285 tp->cookie_values->cookie_desired > 0))) {
1286 u8 *c;
1287 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1288 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1289
1290 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1291 goto drop_and_release;
1292
1293 /* Secret recipe starts with IP addresses */
1294 *mess++ ^= daddr;
1295 *mess++ ^= saddr;
1262 1296
1263 tcp_parse_options(skb, &tmp_opt, 0); 1297 /* plus variable length Initiator Cookie */
1298 c = (u8 *)mess;
1299 while (l-- > 0)
1300 *c++ ^= *hash_location++;
1301
1302#ifdef CONFIG_SYN_COOKIES
1303 want_cookie = 0; /* not our kind of cookie */
1304#endif
1305 tmp_ext.cookie_out_never = 0; /* false */
1306 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1307 } else if (!tp->rx_opt.cookie_in_always) {
1308 /* redundant indications, but ensure initialization. */
1309 tmp_ext.cookie_out_never = 1; /* true */
1310 tmp_ext.cookie_plus = 0;
1311 } else {
1312 goto drop_and_release;
1313 }
1314 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1264 1315
1265 if (want_cookie && !tmp_opt.saw_tstamp) 1316 if (want_cookie && !tmp_opt.saw_tstamp)
1266 tcp_clear_options(&tmp_opt); 1317 tcp_clear_options(&tmp_opt);
1267 1318
1268 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1319 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1269
1270 tcp_openreq_init(req, &tmp_opt, skb); 1320 tcp_openreq_init(req, &tmp_opt, skb);
1271 1321
1272 ireq = inet_rsk(req);
1273 ireq->loc_addr = daddr;
1274 ireq->rmt_addr = saddr;
1275 ireq->no_srccheck = inet_sk(sk)->transparent;
1276 ireq->opt = tcp_v4_save_options(sk, skb);
1277
1278 if (security_inet_conn_request(sk, skb, req)) 1322 if (security_inet_conn_request(sk, skb, req))
1279 goto drop_and_free; 1323 goto drop_and_release;
1280 1324
1281 if (!want_cookie) 1325 if (!want_cookie)
1282 TCP_ECN_create_request(req, tcp_hdr(skb)); 1326 TCP_ECN_create_request(req, tcp_hdr(skb));
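
The cookie "secret recipe" above folds the connection addresses and the variable-length Initiator Cookie into a message buffer by XOR before hashing. A condensed userspace restatement of just that mixing step; DIGEST_WORDS and MESSAGE_WORDS are illustrative stand-ins for the kernel's COOKIE_DIGEST_WORDS/COOKIE_MESSAGE_WORDS, whose values are not shown in this diff:

#include <stdint.h>

#define DIGEST_WORDS	4	/* assumed sizes, for illustration */
#define MESSAGE_WORDS	8

/* Fold two IPv4 addresses and a variable-length initiator cookie into
 * the message half of a digest workspace, mirroring the XOR pattern in
 * tcp_v4_conn_request() above. */
static void cookie_mix(uint32_t bakery[DIGEST_WORDS + MESSAGE_WORDS],
		       uint32_t saddr, uint32_t daddr,
		       const uint8_t *cookie, int len)
{
	uint32_t *mess = &bakery[DIGEST_WORDS];
	uint8_t *c;

	*mess++ ^= daddr;	/* secret recipe starts with IP addresses */
	*mess++ ^= saddr;

	c = (uint8_t *)mess;	/* then the variable-length initiator cookie */
	while (len-- > 0)
		*c++ ^= *cookie++;
}
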
@@ -1301,10 +1345,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1301 */ 1345 */
1302 if (tmp_opt.saw_tstamp && 1346 if (tmp_opt.saw_tstamp &&
1303 tcp_death_row.sysctl_tw_recycle && 1347 tcp_death_row.sysctl_tw_recycle &&
1304 (dst = inet_csk_route_req(sk, req)) != NULL &&
1305 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1348 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1306 peer->v4daddr == saddr) { 1349 peer->v4daddr == saddr) {
1307 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && 1350 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1308 (s32)(peer->tcp_ts - req->ts_recent) > 1351 (s32)(peer->tcp_ts - req->ts_recent) >
1309 TCP_PAWS_WINDOW) { 1352 TCP_PAWS_WINDOW) {
1310 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1353 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1333,7 +1376,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1333 } 1376 }
1334 tcp_rsk(req)->snt_isn = isn; 1377 tcp_rsk(req)->snt_isn = isn;
1335 1378
1336 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) 1379 if (__tcp_v4_send_synack(sk, dst, req,
1380 (struct request_values *)&tmp_ext) ||
1381 want_cookie)
1337 goto drop_and_free; 1382 goto drop_and_free;
1338 1383
1339 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1384 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1380,9 +1425,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1380 newtp = tcp_sk(newsk); 1425 newtp = tcp_sk(newsk);
1381 newinet = inet_sk(newsk); 1426 newinet = inet_sk(newsk);
1382 ireq = inet_rsk(req); 1427 ireq = inet_rsk(req);
1383 newinet->daddr = ireq->rmt_addr; 1428 newinet->inet_daddr = ireq->rmt_addr;
1384 newinet->rcv_saddr = ireq->loc_addr; 1429 newinet->inet_rcv_saddr = ireq->loc_addr;
1385 newinet->saddr = ireq->loc_addr; 1430 newinet->inet_saddr = ireq->loc_addr;
1386 newinet->opt = ireq->opt; 1431 newinet->opt = ireq->opt;
1387 ireq->opt = NULL; 1432 ireq->opt = NULL;
1388 newinet->mc_index = inet_iif(skb); 1433 newinet->mc_index = inet_iif(skb);
@@ -1390,7 +1435,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1390 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1435 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1391 if (newinet->opt) 1436 if (newinet->opt)
1392 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1437 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1393 newinet->id = newtp->write_seq ^ jiffies; 1438 newinet->inet_id = newtp->write_seq ^ jiffies;
1394 1439
1395 tcp_mtup_init(newsk); 1440 tcp_mtup_init(newsk);
1396 tcp_sync_mss(newsk, dst_mtu(dst)); 1441 tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1403,7 +1448,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1403 1448
1404#ifdef CONFIG_TCP_MD5SIG 1449#ifdef CONFIG_TCP_MD5SIG
1405 /* Copy over the MD5 key from the original socket */ 1450 /* Copy over the MD5 key from the original socket */
1406 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { 1451 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1452 if (key != NULL) {
1407 /* 1453 /*
1408 * We're using one, so create a matching key 1454 * We're using one, so create a matching key
1409 * on the newsk structure. If we fail to get 1455 * on the newsk structure. If we fail to get
@@ -1412,7 +1458,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1412 */ 1458 */
1413 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); 1459 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1414 if (newkey != NULL) 1460 if (newkey != NULL)
1415 tcp_v4_md5_do_add(newsk, newinet->daddr, 1461 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1416 newkey, key->keylen); 1462 newkey, key->keylen);
1417 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1463 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1418 } 1464 }
@@ -1711,8 +1757,8 @@ int tcp_v4_remember_stamp(struct sock *sk)
1711 struct inet_peer *peer = NULL; 1757 struct inet_peer *peer = NULL;
1712 int release_it = 0; 1758 int release_it = 0;
1713 1759
1714 if (!rt || rt->rt_dst != inet->daddr) { 1760 if (!rt || rt->rt_dst != inet->inet_daddr) {
1715 peer = inet_getpeer(inet->daddr, 1); 1761 peer = inet_getpeer(inet->inet_daddr, 1);
1716 release_it = 1; 1762 release_it = 1;
1717 } else { 1763 } else {
1718 if (!rt->peer) 1764 if (!rt->peer)
@@ -1722,9 +1768,9 @@ int tcp_v4_remember_stamp(struct sock *sk)
1722 1768
1723 if (peer) { 1769 if (peer) {
1724 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || 1770 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1725 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && 1771 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1726 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { 1772 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1727 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; 1773 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1728 peer->tcp_ts = tp->rx_opt.ts_recent; 1774 peer->tcp_ts = tp->rx_opt.ts_recent;
1729 } 1775 }
1730 if (release_it) 1776 if (release_it)
@@ -1743,9 +1789,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1743 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 1789 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1744 1790
1745 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || 1791 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1746 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && 1792 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1747 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { 1793 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1748 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; 1794 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1749 peer->tcp_ts = tcptw->tw_ts_recent; 1795 peer->tcp_ts = tcptw->tw_ts_recent;
1750 } 1796 }
1751 inet_putpeer(peer); 1797 inet_putpeer(peer);
@@ -1810,7 +1856,7 @@ static int tcp_v4_init_sock(struct sock *sk)
1810 */ 1856 */
1811 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 1857 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1812 tp->snd_cwnd_clamp = ~0; 1858 tp->snd_cwnd_clamp = ~0;
1813 tp->mss_cache = 536; 1859 tp->mss_cache = TCP_MSS_DEFAULT;
1814 1860
1815 tp->reordering = sysctl_tcp_reordering; 1861 tp->reordering = sysctl_tcp_reordering;
1816 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 1862 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1826,6 +1872,19 @@ static int tcp_v4_init_sock(struct sock *sk)
1826 tp->af_specific = &tcp_sock_ipv4_specific; 1872 tp->af_specific = &tcp_sock_ipv4_specific;
1827#endif 1873#endif
1828 1874
1875 /* TCP Cookie Transactions */
1876 if (sysctl_tcp_cookie_size > 0) {
1877 /* Default, cookies without s_data_payload. */
1878 tp->cookie_values =
1879 kzalloc(sizeof(*tp->cookie_values),
1880 sk->sk_allocation);
1881 if (tp->cookie_values != NULL)
1882 kref_init(&tp->cookie_values->kref);
1883 }
1884 /* Presumed zeroed, in order of appearance:
1885 * cookie_in_always, cookie_out_never,
1886 * s_data_constant, s_data_in, s_data_out
1887 */
1829 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1888 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1830 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1889 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1831 1890
@@ -1879,6 +1938,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
1879 sk->sk_sndmsg_page = NULL; 1938 sk->sk_sndmsg_page = NULL;
1880 } 1939 }
1881 1940
1941 /* TCP Cookie Transactions */
1942 if (tp->cookie_values != NULL) {
1943 kref_put(&tp->cookie_values->kref,
1944 tcp_cookie_values_release);
1945 tp->cookie_values = NULL;
1946 }
1947
1882 percpu_counter_dec(&tcp_sockets_allocated); 1948 percpu_counter_dec(&tcp_sockets_allocated);
1883} 1949}
1884 1950
@@ -2000,7 +2066,7 @@ static void *established_get_first(struct seq_file *seq)
2000 struct net *net = seq_file_net(seq); 2066 struct net *net = seq_file_net(seq);
2001 void *rc = NULL; 2067 void *rc = NULL;
2002 2068
2003 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 2069 for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2004 struct sock *sk; 2070 struct sock *sk;
2005 struct hlist_nulls_node *node; 2071 struct hlist_nulls_node *node;
2006 struct inet_timewait_sock *tw; 2072 struct inet_timewait_sock *tw;
@@ -2061,10 +2127,10 @@ get_tw:
2061 st->state = TCP_SEQ_STATE_ESTABLISHED; 2127 st->state = TCP_SEQ_STATE_ESTABLISHED;
2062 2128
2063 /* Look for next non empty bucket */ 2129 /* Look for next non empty bucket */
2064 while (++st->bucket < tcp_hashinfo.ehash_size && 2130 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2065 empty_bucket(st)) 2131 empty_bucket(st))
2066 ; 2132 ;
2067 if (st->bucket >= tcp_hashinfo.ehash_size) 2133 if (st->bucket > tcp_hashinfo.ehash_mask)
2068 return NULL; 2134 return NULL;
2069 2135
2070 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2136 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
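
tcp_hashinfo no longer exposes an ehash_size; the established table is a power of two described by ehash_mask = size - 1, so full walks now run from 0 through the mask inclusive (hence the < to <= and >= to > flips above). A toy sketch of the iteration pattern, with assumed names:

struct toy_bucket { int dummy; };

struct toy_hashinfo {
	unsigned int ehash_mask;	/* table size minus one (size is 2^n) */
	struct toy_bucket *ehash;
};

static void visit(struct toy_bucket *b) { (void)b; }

static void walk_all(struct toy_hashinfo *h)
{
	unsigned int i;

	for (i = 0; i <= h->ehash_mask; i++)	/* <=, since mask == size - 1 */
		visit(&h->ehash[i]);
}
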
@@ -2225,7 +2291,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2225 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2291 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2226 i, 2292 i,
2227 ireq->loc_addr, 2293 ireq->loc_addr,
2228 ntohs(inet_sk(sk)->sport), 2294 ntohs(inet_sk(sk)->inet_sport),
2229 ireq->rmt_addr, 2295 ireq->rmt_addr,
2230 ntohs(ireq->rmt_port), 2296 ntohs(ireq->rmt_port),
2231 TCP_SYN_RECV, 2297 TCP_SYN_RECV,
@@ -2248,10 +2314,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2248 struct tcp_sock *tp = tcp_sk(sk); 2314 struct tcp_sock *tp = tcp_sk(sk);
2249 const struct inet_connection_sock *icsk = inet_csk(sk); 2315 const struct inet_connection_sock *icsk = inet_csk(sk);
2250 struct inet_sock *inet = inet_sk(sk); 2316 struct inet_sock *inet = inet_sk(sk);
2251 __be32 dest = inet->daddr; 2317 __be32 dest = inet->inet_daddr;
2252 __be32 src = inet->rcv_saddr; 2318 __be32 src = inet->inet_rcv_saddr;
2253 __u16 destp = ntohs(inet->dport); 2319 __u16 destp = ntohs(inet->inet_dport);
2254 __u16 srcp = ntohs(inet->sport); 2320 __u16 srcp = ntohs(inet->inet_sport);
2321 int rx_queue;
2255 2322
2256 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 2323 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2257 timer_active = 1; 2324 timer_active = 1;
@@ -2267,12 +2334,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2267 timer_expires = jiffies; 2334 timer_expires = jiffies;
2268 } 2335 }
2269 2336
2337 if (sk->sk_state == TCP_LISTEN)
2338 rx_queue = sk->sk_ack_backlog;
2339 else
2340 /*
 2341 * because we don't lock the socket, we might find a transient negative value
2342 */
2343 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2344
2270 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2345 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2271 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2346 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2272 i, src, srcp, dest, destp, sk->sk_state, 2347 i, src, srcp, dest, destp, sk->sk_state,
2273 tp->write_seq - tp->snd_una, 2348 tp->write_seq - tp->snd_una,
2274 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : 2349 rx_queue,
2275 (tp->rcv_nxt - tp->copied_seq),
2276 timer_active, 2350 timer_active,
2277 jiffies_to_clock_t(timer_expires - jiffies), 2351 jiffies_to_clock_t(timer_expires - jiffies),
2278 icsk->icsk_retransmits, 2352 icsk->icsk_retransmits,
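
Since /proc readers sample tp->rcv_nxt and tp->copied_seq without holding the socket lock, the difference can transiently appear negative; the new rx_queue computation clamps it at zero. Equivalent userspace sketch:

#include <stdint.h>

/* Lockless snapshot: the two counters may be read out of order, so
 * clamp the difference at zero, as the max_t(int, ...) above does. */
static int rx_queue_len(uint32_t rcv_nxt, uint32_t copied_seq)
{
	int d = (int)(rcv_nxt - copied_seq);

	return d > 0 ? d : 0;
}
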
@@ -2463,12 +2537,17 @@ static int __net_init tcp_sk_init(struct net *net)
2463static void __net_exit tcp_sk_exit(struct net *net) 2537static void __net_exit tcp_sk_exit(struct net *net)
2464{ 2538{
2465 inet_ctl_sock_destroy(net->ipv4.tcp_sock); 2539 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2466 inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); 2540}
2541
2542static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2543{
2544 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2467} 2545}
2468 2546
2469static struct pernet_operations __net_initdata tcp_sk_ops = { 2547static struct pernet_operations __net_initdata tcp_sk_ops = {
2470 .init = tcp_sk_init, 2548 .init = tcp_sk_init,
2471 .exit = tcp_sk_exit, 2549 .exit = tcp_sk_exit,
2550 .exit_batch = tcp_sk_exit_batch,
2472}; 2551};
2473 2552
2474void __init tcp_v4_init(void) 2553void __init tcp_v4_init(void)
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ce3c41ff50b2..de870377fbba 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
143 goto out; 143 goto out;
144 144
145 /* we can't calc remote HZ with no different!! */ 145 /* we can't calc remote HZ with no different!! */
146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time 146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
147 || tp->rx_opt.rcv_tsecr == lp->local_ref_time) 147 tp->rx_opt.rcv_tsecr == lp->local_ref_time)
148 goto out; 148 goto out;
149 149
150 m = HZ * (tp->rx_opt.rcv_tsval - 150 m = HZ * (tp->rx_opt.rcv_tsval -
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4c03598ed924..87accec8d097 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,13 +26,7 @@
26#include <net/inet_common.h> 26#include <net/inet_common.h>
27#include <net/xfrm.h> 27#include <net/xfrm.h>
28 28
29#ifdef CONFIG_SYSCTL 29int sysctl_tcp_syncookies __read_mostly = 1;
30#define SYNC_INIT 0 /* let the user enable it */
31#else
32#define SYNC_INIT 1
33#endif
34
35int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
36EXPORT_SYMBOL(sysctl_tcp_syncookies); 30EXPORT_SYMBOL(sysctl_tcp_syncookies);
37 31
38int sysctl_tcp_abort_on_overflow __read_mostly; 32int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -96,13 +90,14 @@ enum tcp_tw_status
96tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 90tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
97 const struct tcphdr *th) 91 const struct tcphdr *th)
98{ 92{
99 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
100 struct tcp_options_received tmp_opt; 93 struct tcp_options_received tmp_opt;
94 u8 *hash_location;
95 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
101 int paws_reject = 0; 96 int paws_reject = 0;
102 97
103 tmp_opt.saw_tstamp = 0;
104 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 98 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
105 tcp_parse_options(skb, &tmp_opt, 0); 99 tmp_opt.tstamp_ok = 1;
100 tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL);
106 101
107 if (tmp_opt.saw_tstamp) { 102 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 103 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -389,14 +384,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
389 const struct inet_request_sock *ireq = inet_rsk(req); 384 const struct inet_request_sock *ireq = inet_rsk(req);
390 struct tcp_request_sock *treq = tcp_rsk(req); 385 struct tcp_request_sock *treq = tcp_rsk(req);
391 struct inet_connection_sock *newicsk = inet_csk(newsk); 386 struct inet_connection_sock *newicsk = inet_csk(newsk);
392 struct tcp_sock *newtp; 387 struct tcp_sock *newtp = tcp_sk(newsk);
388 struct tcp_sock *oldtp = tcp_sk(sk);
389 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
390
391 /* TCP Cookie Transactions require space for the cookie pair,
392 * as it differs for each connection. There is no need to
393 * copy any s_data_payload stored at the original socket.
394 * Failure will prevent resuming the connection.
395 *
396 * Presumed copied, in order of appearance:
397 * cookie_in_always, cookie_out_never
398 */
399 if (oldcvp != NULL) {
400 struct tcp_cookie_values *newcvp =
401 kzalloc(sizeof(*newtp->cookie_values),
402 GFP_ATOMIC);
403
404 if (newcvp != NULL) {
405 kref_init(&newcvp->kref);
406 newcvp->cookie_desired =
407 oldcvp->cookie_desired;
408 newtp->cookie_values = newcvp;
409 } else {
410 /* Not Yet Implemented */
411 newtp->cookie_values = NULL;
412 }
413 }
393 414
394 /* Now setup tcp_sock */ 415 /* Now setup tcp_sock */
395 newtp = tcp_sk(newsk);
396 newtp->pred_flags = 0; 416 newtp->pred_flags = 0;
397 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 417
398 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 418 newtp->rcv_wup = newtp->copied_seq =
399 newtp->snd_up = treq->snt_isn + 1; 419 newtp->rcv_nxt = treq->rcv_isn + 1;
420
421 newtp->snd_sml = newtp->snd_una =
422 newtp->snd_nxt = newtp->snd_up =
423 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
400 424
401 tcp_prequeue_init(newtp); 425 tcp_prequeue_init(newtp);
402 426
@@ -429,8 +453,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
429 tcp_set_ca_state(newsk, TCP_CA_Open); 453 tcp_set_ca_state(newsk, TCP_CA_Open);
430 tcp_init_xmit_timers(newsk); 454 tcp_init_xmit_timers(newsk);
431 skb_queue_head_init(&newtp->out_of_order_queue); 455 skb_queue_head_init(&newtp->out_of_order_queue);
432 newtp->write_seq = treq->snt_isn + 1; 456 newtp->write_seq = newtp->pushed_seq =
433 newtp->pushed_seq = newtp->write_seq; 457 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
434 458
435 newtp->rx_opt.saw_tstamp = 0; 459 newtp->rx_opt.saw_tstamp = 0;
436 460
@@ -476,7 +500,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
476 if (newtp->af_specific->md5_lookup(sk, newsk)) 500 if (newtp->af_specific->md5_lookup(sk, newsk))
477 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 501 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
478#endif 502#endif
479 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 503 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
480 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 504 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
481 newtp->rx_opt.mss_clamp = req->mss; 505 newtp->rx_opt.mss_clamp = req->mss;
482 TCP_ECN_openreq_child(newtp, req); 506 TCP_ECN_openreq_child(newtp, req);
@@ -495,15 +519,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
495 struct request_sock *req, 519 struct request_sock *req,
496 struct request_sock **prev) 520 struct request_sock **prev)
497{ 521{
522 struct tcp_options_received tmp_opt;
523 u8 *hash_location;
524 struct sock *child;
498 const struct tcphdr *th = tcp_hdr(skb); 525 const struct tcphdr *th = tcp_hdr(skb);
499 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 526 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
500 int paws_reject = 0; 527 int paws_reject = 0;
501 struct tcp_options_received tmp_opt;
502 struct sock *child;
503 528
504 tmp_opt.saw_tstamp = 0; 529 if ((th->doff > (sizeof(*th) >> 2)) && (req->ts_recent)) {
505 if (th->doff > (sizeof(struct tcphdr)>>2)) { 530 tmp_opt.tstamp_ok = 1;
506 tcp_parse_options(skb, &tmp_opt, 0); 531 tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL);
507 532
508 if (tmp_opt.saw_tstamp) { 533 if (tmp_opt.saw_tstamp) {
509 tmp_opt.ts_recent = req->ts_recent; 534 tmp_opt.ts_recent = req->ts_recent;
@@ -537,7 +562,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
537 * Enforce "SYN-ACK" according to figure 8, figure 6 562 * Enforce "SYN-ACK" according to figure 8, figure 6
538 * of RFC793, fixed by RFC1122. 563 * of RFC793, fixed by RFC1122.
539 */ 564 */
540 req->rsk_ops->rtx_syn_ack(sk, req); 565 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
541 return NULL; 566 return NULL;
542 } 567 }
543 568
@@ -596,7 +621,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
596 * Invalid ACK: reset will be sent by listening socket 621 * Invalid ACK: reset will be sent by listening socket
597 */ 622 */
598 if ((flg & TCP_FLAG_ACK) && 623 if ((flg & TCP_FLAG_ACK) &&
599 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) 624 (TCP_SKB_CB(skb)->ack_seq !=
625 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
600 return sk; 626 return sk;
601 627
602 /* Also, it would be not so bad idea to check rcv_tsecr, which 628 /* Also, it would be not so bad idea to check rcv_tsecr, which
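
A recurring adjustment throughout tcp_minisocks.c: with TCP cookie transactions, a SYN-ACK may carry s_data, which consumes sequence space just like ordinary payload, so both the child's initial send sequence numbers and the validation of the third-step ACK are offset by tcp_s_data_size(). A minimal sketch of the ACK check under that assumption:

#include <stdint.h>

/* A valid ACK must cover the ISN, the SYN flag (+1), and any s_data
 * bytes carried on the SYN-ACK. */
static int ack_seq_valid(uint32_t ack_seq, uint32_t snt_isn,
			 uint32_t s_data_size)
{
	return ack_seq == snt_isn + 1 + s_data_size;
}

The same offset appears in tcp_create_openreq_child() above when seeding snd_una/snd_nxt/write_seq/pushed_seq.
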
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a7080e..93316a96d820 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,10 @@ int sysctl_tcp_base_mss __read_mostly = 512;
59/* By default, RFC2861 behavior. */ 59/* By default, RFC2861 behavior. */
60int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 60int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
61 61
62int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
63EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
64
65
62/* Account for new data that has been sent to the network. */ 66/* Account for new data that has been sent to the network. */
63static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) 67static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
64{ 68{
@@ -362,15 +366,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
362#define OPTION_TS (1 << 1) 366#define OPTION_TS (1 << 1)
363#define OPTION_MD5 (1 << 2) 367#define OPTION_MD5 (1 << 2)
364#define OPTION_WSCALE (1 << 3) 368#define OPTION_WSCALE (1 << 3)
369#define OPTION_COOKIE_EXTENSION (1 << 4)
365 370
366struct tcp_out_options { 371struct tcp_out_options {
367 u8 options; /* bit field of OPTION_* */ 372 u8 options; /* bit field of OPTION_* */
368 u8 ws; /* window scale, 0 to disable */ 373 u8 ws; /* window scale, 0 to disable */
369 u8 num_sack_blocks; /* number of SACK blocks to include */ 374 u8 num_sack_blocks; /* number of SACK blocks to include */
375 u8 hash_size; /* bytes in hash_location */
370 u16 mss; /* 0 to disable */ 376 u16 mss; /* 0 to disable */
371 __u32 tsval, tsecr; /* need to include OPTION_TS */ 377 __u32 tsval, tsecr; /* need to include OPTION_TS */
378 __u8 *hash_location; /* temporary pointer, overloaded */
372}; 379};
373 380
381/* The sysctl int routines are generic, so check consistency here.
382 */
383static u8 tcp_cookie_size_check(u8 desired)
384{
385 if (desired > 0) {
386 /* previously specified */
387 return desired;
388 }
389 if (sysctl_tcp_cookie_size <= 0) {
390 /* no default specified */
391 return 0;
392 }
393 if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
394 /* value too small, specify minimum */
395 return TCP_COOKIE_MIN;
396 }
397 if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
398 /* value too large, specify maximum */
399 return TCP_COOKIE_MAX;
400 }
401 if (0x1 & sysctl_tcp_cookie_size) {
402 /* 8-bit multiple, illegal, fix it */
403 return (u8)(sysctl_tcp_cookie_size + 0x1);
404 }
405 return (u8)sysctl_tcp_cookie_size;
406}
407
374/* Write previously computed TCP options to the packet. 408/* Write previously computed TCP options to the packet.
375 * 409 *
376 * Beware: Something in the Internet is very sensitive to the ordering of 410 * Beware: Something in the Internet is very sensitive to the ordering of
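
tcp_cookie_size_check() above normalizes the sysctl against an explicit per-socket request. A standalone restatement; COOKIE_MIN and COOKIE_MAX are stand-ins for TCP_COOKIE_MIN/TCP_COOKIE_MAX, whose exact values (assumed 8 and 16 here) are not shown in this diff:

#include <stdint.h>

#define COOKIE_MIN 8	/* stand-in for TCP_COOKIE_MIN (assumed) */
#define COOKIE_MAX 16	/* stand-in for TCP_COOKIE_MAX (assumed) */

static uint8_t cookie_size_check(uint8_t desired, int sysctl_size)
{
	if (desired > 0)
		return desired;		/* previously specified */
	if (sysctl_size <= 0)
		return 0;		/* no default configured */
	if (sysctl_size <= COOKIE_MIN)
		return COOKIE_MIN;	/* too small, clamp up */
	if (sysctl_size >= COOKIE_MAX)
		return COOKIE_MAX;	/* too large, clamp down */
	if (sysctl_size & 0x1)
		return (uint8_t)(sysctl_size + 0x1);	/* round odd sizes up */
	return (uint8_t)sysctl_size;
}

The rounding keeps the cookie a 16-bit multiple, which the option writer below relies on.
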
@@ -385,17 +419,34 @@ struct tcp_out_options {
385 * (but it may well be that other scenarios fail similarly). 419 * (but it may well be that other scenarios fail similarly).
386 */ 420 */
387static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 421static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
388 const struct tcp_out_options *opts, 422 struct tcp_out_options *opts)
389 __u8 **md5_hash) { 423{
390 if (unlikely(OPTION_MD5 & opts->options)) { 424 u8 options = opts->options; /* mungable copy */
391 *ptr++ = htonl((TCPOPT_NOP << 24) | 425
392 (TCPOPT_NOP << 16) | 426 /* Having both authentication and cookies for security is redundant,
393 (TCPOPT_MD5SIG << 8) | 427 * and there's certainly not enough room. Instead, the cookie-less
394 TCPOLEN_MD5SIG); 428 * extension variant is proposed.
395 *md5_hash = (__u8 *)ptr; 429 *
430 * Consider the pessimal case with authentication. The options
431 * could look like:
432 * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
433 */
434 if (unlikely(OPTION_MD5 & options)) {
435 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
436 *ptr++ = htonl((TCPOPT_COOKIE << 24) |
437 (TCPOLEN_COOKIE_BASE << 16) |
438 (TCPOPT_MD5SIG << 8) |
439 TCPOLEN_MD5SIG);
440 } else {
441 *ptr++ = htonl((TCPOPT_NOP << 24) |
442 (TCPOPT_NOP << 16) |
443 (TCPOPT_MD5SIG << 8) |
444 TCPOLEN_MD5SIG);
445 }
446 options &= ~OPTION_COOKIE_EXTENSION;
447 /* overload cookie hash location */
448 opts->hash_location = (__u8 *)ptr;
396 ptr += 4; 449 ptr += 4;
397 } else {
398 *md5_hash = NULL;
399 } 450 }
400 451
401 if (unlikely(opts->mss)) { 452 if (unlikely(opts->mss)) {
@@ -404,12 +455,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
404 opts->mss); 455 opts->mss);
405 } 456 }
406 457
407 if (likely(OPTION_TS & opts->options)) { 458 if (likely(OPTION_TS & options)) {
408 if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { 459 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
409 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | 460 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
410 (TCPOLEN_SACK_PERM << 16) | 461 (TCPOLEN_SACK_PERM << 16) |
411 (TCPOPT_TIMESTAMP << 8) | 462 (TCPOPT_TIMESTAMP << 8) |
412 TCPOLEN_TIMESTAMP); 463 TCPOLEN_TIMESTAMP);
464 options &= ~OPTION_SACK_ADVERTISE;
413 } else { 465 } else {
414 *ptr++ = htonl((TCPOPT_NOP << 24) | 466 *ptr++ = htonl((TCPOPT_NOP << 24) |
415 (TCPOPT_NOP << 16) | 467 (TCPOPT_NOP << 16) |
@@ -420,15 +472,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
420 *ptr++ = htonl(opts->tsecr); 472 *ptr++ = htonl(opts->tsecr);
421 } 473 }
422 474
 423 if (unlikely(OPTION_SACK_ADVERTISE & opts->options && 475 /* Specification requires the cookie after the timestamp, so do it now.
424 !(OPTION_TS & opts->options))) { 476 *
477 * Consider the pessimal case without authentication. The options
478 * could look like:
479 * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
480 */
481 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
482 __u8 *cookie_copy = opts->hash_location;
483 u8 cookie_size = opts->hash_size;
484
485 /* 8-bit multiple handled in tcp_cookie_size_check() above,
486 * and elsewhere.
487 */
488 if (0x2 & cookie_size) {
489 __u8 *p = (__u8 *)ptr;
490
491 /* 16-bit multiple */
492 *p++ = TCPOPT_COOKIE;
493 *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
494 *p++ = *cookie_copy++;
495 *p++ = *cookie_copy++;
496 ptr++;
497 cookie_size -= 2;
498 } else {
499 /* 32-bit multiple */
500 *ptr++ = htonl(((TCPOPT_NOP << 24) |
501 (TCPOPT_NOP << 16) |
502 (TCPOPT_COOKIE << 8) |
503 TCPOLEN_COOKIE_BASE) +
504 cookie_size);
505 }
506
507 if (cookie_size > 0) {
508 memcpy(ptr, cookie_copy, cookie_size);
509 ptr += (cookie_size / 4);
510 }
511 }
512
513 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
425 *ptr++ = htonl((TCPOPT_NOP << 24) | 514 *ptr++ = htonl((TCPOPT_NOP << 24) |
426 (TCPOPT_NOP << 16) | 515 (TCPOPT_NOP << 16) |
427 (TCPOPT_SACK_PERM << 8) | 516 (TCPOPT_SACK_PERM << 8) |
428 TCPOLEN_SACK_PERM); 517 TCPOLEN_SACK_PERM);
429 } 518 }
430 519
431 if (unlikely(OPTION_WSCALE & opts->options)) { 520 if (unlikely(OPTION_WSCALE & options)) {
432 *ptr++ = htonl((TCPOPT_NOP << 24) | 521 *ptr++ = htonl((TCPOPT_NOP << 24) |
433 (TCPOPT_WINDOW << 16) | 522 (TCPOPT_WINDOW << 16) |
434 (TCPOLEN_WINDOW << 8) | 523 (TCPOLEN_WINDOW << 8) |
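
The cookie option emission above has to keep the header 32-bit aligned: when the cookie length is only a 16-bit multiple, its first two bytes ride in the same word as the kind/length pair; otherwise two NOPs pad the word. A userspace sketch of that layout decision — the TCPOPT_COOKIE kind value (253) is assumed, since it is not shown in this hunk:

#include <stdint.h>
#include <string.h>

#define TCPOPT_NOP	1
#define TCPOPT_COOKIE	253	/* assumed experimental option kind */
#define COOKIE_BASE	2	/* kind + length bytes */

/* Emit a cookie option into buf; returns bytes written (a multiple of 4). */
static int write_cookie_opt(uint8_t *buf, const uint8_t *cookie, uint8_t size)
{
	uint8_t *p = buf;

	if (size & 0x2) {		/* 16-bit multiple: share the word */
		*p++ = TCPOPT_COOKIE;
		*p++ = COOKIE_BASE + size;
		*p++ = *cookie++;
		*p++ = *cookie++;
		size -= 2;
	} else {			/* 32-bit multiple: pad with NOPs */
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_NOP;
		*p++ = TCPOPT_COOKIE;
		*p++ = COOKIE_BASE + size;
	}
	memcpy(p, cookie, size);	/* remaining cookie bytes are aligned */
	return (int)(p - buf) + size;
}
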
@@ -463,13 +552,18 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
463 struct tcp_out_options *opts, 552 struct tcp_out_options *opts,
464 struct tcp_md5sig_key **md5) { 553 struct tcp_md5sig_key **md5) {
465 struct tcp_sock *tp = tcp_sk(sk); 554 struct tcp_sock *tp = tcp_sk(sk);
466 unsigned size = 0; 555 struct tcp_cookie_values *cvp = tp->cookie_values;
556 struct dst_entry *dst = __sk_dst_get(sk);
557 unsigned remaining = MAX_TCP_OPTION_SPACE;
558 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
559 tcp_cookie_size_check(cvp->cookie_desired) :
560 0;
467 561
468#ifdef CONFIG_TCP_MD5SIG 562#ifdef CONFIG_TCP_MD5SIG
469 *md5 = tp->af_specific->md5_lookup(sk, sk); 563 *md5 = tp->af_specific->md5_lookup(sk, sk);
470 if (*md5) { 564 if (*md5) {
471 opts->options |= OPTION_MD5; 565 opts->options |= OPTION_MD5;
472 size += TCPOLEN_MD5SIG_ALIGNED; 566 remaining -= TCPOLEN_MD5SIG_ALIGNED;
473 } 567 }
474#else 568#else
475 *md5 = NULL; 569 *md5 = NULL;
@@ -485,26 +579,76 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
485 * SACKs don't matter, we never delay an ACK when we have any of those 579 * SACKs don't matter, we never delay an ACK when we have any of those
486 * going out. */ 580 * going out. */
487 opts->mss = tcp_advertise_mss(sk); 581 opts->mss = tcp_advertise_mss(sk);
488 size += TCPOLEN_MSS_ALIGNED; 582 remaining -= TCPOLEN_MSS_ALIGNED;
489 583
490 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 584 if (likely(sysctl_tcp_timestamps &&
585 !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
586 *md5 == NULL)) {
491 opts->options |= OPTION_TS; 587 opts->options |= OPTION_TS;
492 opts->tsval = TCP_SKB_CB(skb)->when; 588 opts->tsval = TCP_SKB_CB(skb)->when;
493 opts->tsecr = tp->rx_opt.ts_recent; 589 opts->tsecr = tp->rx_opt.ts_recent;
494 size += TCPOLEN_TSTAMP_ALIGNED; 590 remaining -= TCPOLEN_TSTAMP_ALIGNED;
495 } 591 }
496 if (likely(sysctl_tcp_window_scaling)) { 592 if (likely(sysctl_tcp_window_scaling &&
593 !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
497 opts->ws = tp->rx_opt.rcv_wscale; 594 opts->ws = tp->rx_opt.rcv_wscale;
498 opts->options |= OPTION_WSCALE; 595 opts->options |= OPTION_WSCALE;
499 size += TCPOLEN_WSCALE_ALIGNED; 596 remaining -= TCPOLEN_WSCALE_ALIGNED;
500 } 597 }
501 if (likely(sysctl_tcp_sack)) { 598 if (likely(sysctl_tcp_sack &&
599 !dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
502 opts->options |= OPTION_SACK_ADVERTISE; 600 opts->options |= OPTION_SACK_ADVERTISE;
503 if (unlikely(!(OPTION_TS & opts->options))) 601 if (unlikely(!(OPTION_TS & opts->options)))
504 size += TCPOLEN_SACKPERM_ALIGNED; 602 remaining -= TCPOLEN_SACKPERM_ALIGNED;
505 } 603 }
506 604
507 return size; 605 /* Note that timestamps are required by the specification.
606 *
607 * Odd numbers of bytes are prohibited by the specification, ensuring
608 * that the cookie is 16-bit aligned, and the resulting cookie pair is
609 * 32-bit aligned.
610 */
611 if (*md5 == NULL &&
612 (OPTION_TS & opts->options) &&
613 cookie_size > 0) {
614 int need = TCPOLEN_COOKIE_BASE + cookie_size;
615
616 if (0x2 & need) {
617 /* 32-bit multiple */
618 need += 2; /* NOPs */
619
620 if (need > remaining) {
621 /* try shrinking cookie to fit */
622 cookie_size -= 2;
623 need -= 4;
624 }
625 }
626 while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
627 cookie_size -= 4;
628 need -= 4;
629 }
630 if (TCP_COOKIE_MIN <= cookie_size) {
631 opts->options |= OPTION_COOKIE_EXTENSION;
632 opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
633 opts->hash_size = cookie_size;
634
635 /* Remember for future incarnations. */
636 cvp->cookie_desired = cookie_size;
637
638 if (cvp->cookie_desired != cvp->cookie_pair_size) {
639 /* Currently use random bytes as a nonce,
640 * assuming these are completely unpredictable
641 * by hostile users of the same system.
642 */
643 get_random_bytes(&cvp->cookie_pair[0],
644 cookie_size);
645 cvp->cookie_pair_size = cookie_size;
646 }
647
648 remaining -= need;
649 }
650 }
651 return MAX_TCP_OPTION_SPACE - remaining;
508} 652}
509 653
510/* Set up TCP options for SYN-ACKs. */ 654/* Set up TCP options for SYN-ACKs. */
@@ -512,48 +656,77 @@ static unsigned tcp_synack_options(struct sock *sk,
512 struct request_sock *req, 656 struct request_sock *req,
513 unsigned mss, struct sk_buff *skb, 657 unsigned mss, struct sk_buff *skb,
514 struct tcp_out_options *opts, 658 struct tcp_out_options *opts,
515 struct tcp_md5sig_key **md5) { 659 struct tcp_md5sig_key **md5,
516 unsigned size = 0; 660 struct tcp_extend_values *xvp)
661{
517 struct inet_request_sock *ireq = inet_rsk(req); 662 struct inet_request_sock *ireq = inet_rsk(req);
518 char doing_ts; 663 unsigned remaining = MAX_TCP_OPTION_SPACE;
664 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
665 xvp->cookie_plus :
666 0;
667 bool doing_ts = ireq->tstamp_ok;
519 668
520#ifdef CONFIG_TCP_MD5SIG 669#ifdef CONFIG_TCP_MD5SIG
521 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 670 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
522 if (*md5) { 671 if (*md5) {
523 opts->options |= OPTION_MD5; 672 opts->options |= OPTION_MD5;
524 size += TCPOLEN_MD5SIG_ALIGNED; 673 remaining -= TCPOLEN_MD5SIG_ALIGNED;
674
675 /* We can't fit any SACK blocks in a packet with MD5 + TS
676 * options. There was discussion about disabling SACK
677 * rather than TS in order to fit in better with old,
678 * buggy kernels, but that was deemed to be unnecessary.
679 */
680 doing_ts &= !ireq->sack_ok;
525 } 681 }
526#else 682#else
527 *md5 = NULL; 683 *md5 = NULL;
528#endif 684#endif
529 685
530 /* we can't fit any SACK blocks in a packet with MD5 + TS 686 /* We always send an MSS option. */
531 options. There was discussion about disabling SACK rather than TS in
532 order to fit in better with old, buggy kernels, but that was deemed
533 to be unnecessary. */
534 doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
535
536 opts->mss = mss; 687 opts->mss = mss;
537 size += TCPOLEN_MSS_ALIGNED; 688 remaining -= TCPOLEN_MSS_ALIGNED;
538 689
539 if (likely(ireq->wscale_ok)) { 690 if (likely(ireq->wscale_ok)) {
540 opts->ws = ireq->rcv_wscale; 691 opts->ws = ireq->rcv_wscale;
541 opts->options |= OPTION_WSCALE; 692 opts->options |= OPTION_WSCALE;
542 size += TCPOLEN_WSCALE_ALIGNED; 693 remaining -= TCPOLEN_WSCALE_ALIGNED;
543 } 694 }
544 if (likely(doing_ts)) { 695 if (likely(doing_ts)) {
545 opts->options |= OPTION_TS; 696 opts->options |= OPTION_TS;
546 opts->tsval = TCP_SKB_CB(skb)->when; 697 opts->tsval = TCP_SKB_CB(skb)->when;
547 opts->tsecr = req->ts_recent; 698 opts->tsecr = req->ts_recent;
548 size += TCPOLEN_TSTAMP_ALIGNED; 699 remaining -= TCPOLEN_TSTAMP_ALIGNED;
549 } 700 }
550 if (likely(ireq->sack_ok)) { 701 if (likely(ireq->sack_ok)) {
551 opts->options |= OPTION_SACK_ADVERTISE; 702 opts->options |= OPTION_SACK_ADVERTISE;
552 if (unlikely(!doing_ts)) 703 if (unlikely(!doing_ts))
553 size += TCPOLEN_SACKPERM_ALIGNED; 704 remaining -= TCPOLEN_SACKPERM_ALIGNED;
554 } 705 }
555 706
556 return size; 707 /* Similar rationale to tcp_syn_options() applies here, too.
708 * If the <SYN> options fit, the same options should fit now!
709 */
710 if (*md5 == NULL &&
711 doing_ts &&
712 cookie_plus > TCPOLEN_COOKIE_BASE) {
713 int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
714
715 if (0x2 & need) {
716 /* 32-bit multiple */
717 need += 2; /* NOPs */
718 }
719 if (need <= remaining) {
720 opts->options |= OPTION_COOKIE_EXTENSION;
721 opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
722 remaining -= need;
723 } else {
724 /* There's no error return, so flag it. */
725 xvp->cookie_out_never = 1; /* true */
726 opts->hash_size = 0;
727 }
728 }
729 return MAX_TCP_OPTION_SPACE - remaining;
557} 730}
558 731
559/* Compute TCP options for ESTABLISHED sockets. This is not the 732/* Compute TCP options for ESTABLISHED sockets. This is not the
@@ -619,7 +792,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
619 struct tcp_out_options opts; 792 struct tcp_out_options opts;
620 unsigned tcp_options_size, tcp_header_size; 793 unsigned tcp_options_size, tcp_header_size;
621 struct tcp_md5sig_key *md5; 794 struct tcp_md5sig_key *md5;
622 __u8 *md5_hash_location;
623 struct tcphdr *th; 795 struct tcphdr *th;
624 int err; 796 int err;
625 797
@@ -661,8 +833,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
661 833
662 /* Build TCP header and checksum it. */ 834 /* Build TCP header and checksum it. */
663 th = tcp_hdr(skb); 835 th = tcp_hdr(skb);
664 th->source = inet->sport; 836 th->source = inet->inet_sport;
665 th->dest = inet->dport; 837 th->dest = inet->inet_dport;
666 th->seq = htonl(tcb->seq); 838 th->seq = htonl(tcb->seq);
667 th->ack_seq = htonl(tp->rcv_nxt); 839 th->ack_seq = htonl(tp->rcv_nxt);
668 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 840 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
@@ -690,7 +862,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
690 } 862 }
691 } 863 }
692 864
693 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 865 tcp_options_write((__be32 *)(th + 1), tp, &opts);
694 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) 866 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
695 TCP_ECN_send(sk, skb, tcp_header_size); 867 TCP_ECN_send(sk, skb, tcp_header_size);
696 868
@@ -698,7 +870,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
698 /* Calculate the MD5 hash, as we have all we need now */ 870 /* Calculate the MD5 hash, as we have all we need now */
699 if (md5) { 871 if (md5) {
700 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 872 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
701 tp->af_specific->calc_md5_hash(md5_hash_location, 873 tp->af_specific->calc_md5_hash(opts.hash_location,
702 md5, sk, NULL, skb); 874 md5, sk, NULL, skb);
703 } 875 }
704#endif 876#endif
@@ -1918,8 +2090,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1918 * case, when window is shrunk to zero. In this case 2090 * case, when window is shrunk to zero. In this case
1919 * our retransmit serves as a zero window probe. 2091 * our retransmit serves as a zero window probe.
1920 */ 2092 */
1921 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) 2093 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
1922 && TCP_SKB_CB(skb)->seq != tp->snd_una) 2094 TCP_SKB_CB(skb)->seq != tp->snd_una)
1923 return -EAGAIN; 2095 return -EAGAIN;
1924 2096
1925 if (skb->len > cur_mss) { 2097 if (skb->len > cur_mss) {
@@ -2219,16 +2391,17 @@ int tcp_send_synack(struct sock *sk)
2219 2391
2220/* Prepare a SYN-ACK. */ 2392/* Prepare a SYN-ACK. */
2221struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2393struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2222 struct request_sock *req) 2394 struct request_sock *req,
2395 struct request_values *rvp)
2223{ 2396{
2397 struct tcp_out_options opts;
2398 struct tcp_extend_values *xvp = tcp_xv(rvp);
2224 struct inet_request_sock *ireq = inet_rsk(req); 2399 struct inet_request_sock *ireq = inet_rsk(req);
2225 struct tcp_sock *tp = tcp_sk(sk); 2400 struct tcp_sock *tp = tcp_sk(sk);
2226 struct tcphdr *th; 2401 struct tcphdr *th;
2227 int tcp_header_size;
2228 struct tcp_out_options opts;
2229 struct sk_buff *skb; 2402 struct sk_buff *skb;
2230 struct tcp_md5sig_key *md5; 2403 struct tcp_md5sig_key *md5;
2231 __u8 *md5_hash_location; 2404 int tcp_header_size;
2232 int mss; 2405 int mss;
2233 2406
2234 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2407 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
@@ -2266,8 +2439,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2266#endif 2439#endif
2267 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2440 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2268 tcp_header_size = tcp_synack_options(sk, req, mss, 2441 tcp_header_size = tcp_synack_options(sk, req, mss,
2269 skb, &opts, &md5) + 2442 skb, &opts, &md5, xvp)
2270 sizeof(struct tcphdr); 2443 + sizeof(*th);
2271 2444
2272 skb_push(skb, tcp_header_size); 2445 skb_push(skb, tcp_header_size);
2273 skb_reset_transport_header(skb); 2446 skb_reset_transport_header(skb);
@@ -2284,19 +2457,58 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2284 */ 2457 */
2285 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2458 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2286 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); 2459 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
2460
2461 if (OPTION_COOKIE_EXTENSION & opts.options) {
2462 const struct tcp_cookie_values *cvp = tp->cookie_values;
2463
2464 if (cvp != NULL &&
2465 cvp->s_data_constant &&
2466 cvp->s_data_desired > 0) {
2467 u8 *buf = skb_put(skb, cvp->s_data_desired);
2468
2469 /* copy data directly from the listening socket. */
2470 memcpy(buf, cvp->s_data_payload, cvp->s_data_desired);
2471 TCP_SKB_CB(skb)->end_seq += cvp->s_data_desired;
2472 }
2473
2474 if (opts.hash_size > 0) {
2475 __u32 workspace[SHA_WORKSPACE_WORDS];
2476 u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
2477 u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
2478
2479 /* Secret recipe depends on the Timestamp, (future)
2480 * Sequence and Acknowledgment Numbers, Initiator
2481 * Cookie, and others handled by IP variant caller.
2482 */
2483 *tail-- ^= opts.tsval;
2484 *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
2485 *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
2486
2487 /* recommended */
2488 *tail-- ^= ((th->dest << 16) | th->source);
2489 *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
2490
2491 sha_transform((__u32 *)&xvp->cookie_bakery[0],
2492 (char *)mess,
2493 &workspace[0]);
2494 opts.hash_location =
2495 (__u8 *)&xvp->cookie_bakery[0];
2496 }
2497 }
2498
2287 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2499 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2288 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2500 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2289 2501
2290 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2502 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2291 th->window = htons(min(req->rcv_wnd, 65535U)); 2503 th->window = htons(min(req->rcv_wnd, 65535U));
2292 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 2504 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2293 th->doff = (tcp_header_size >> 2); 2505 th->doff = (tcp_header_size >> 2);
2294 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); 2506 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
2295 2507
2296#ifdef CONFIG_TCP_MD5SIG 2508#ifdef CONFIG_TCP_MD5SIG
2297 /* Okay, we have all we need - do the md5 hash if needed */ 2509 /* Okay, we have all we need - do the md5 hash if needed */
2298 if (md5) { 2510 if (md5) {
2299 tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, 2511 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2300 md5, NULL, req, skb); 2512 md5, NULL, req, skb);
2301 } 2513 }
2302#endif 2514#endif
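
The SYN-ACK path above finishes the cookie by XOR-ing per-connection values into the tail of the message words and then hashing the buffer. A condensed sketch of the mixing; the word counts are assumed (SHA-1 shapes), and the kernel's sha_transform() digest step is left as a comment:

#include <stdint.h>

#define DIGEST_WORDS	5	/* assumed, SHA-1 state words */
#define MESSAGE_WORDS	16	/* assumed, one SHA-1 block */

/* Mix per-connection values into the tail of the message block before
 * hashing, mirroring the *tail-- ^= ... sequence above. */
static void bake_cookie(uint32_t bakery[DIGEST_WORDS + MESSAGE_WORDS],
			uint32_t tsval, uint32_t rcv_isn, uint32_t seq,
			uint16_t sport, uint16_t dport, uintptr_t per_sock)
{
	uint32_t *mess = &bakery[DIGEST_WORDS];
	uint32_t *tail = &mess[MESSAGE_WORDS - 1];

	*tail-- ^= tsval;			/* timestamp */
	*tail-- ^= rcv_isn + 1;			/* acknowledgment number */
	*tail-- ^= seq + 1;			/* (future) sequence number */
	*tail-- ^= ((uint32_t)dport << 16) | sport;	/* recommended port mix */
	*tail-- ^= (uint32_t)per_sock;		/* per-sockopt pointer value */

	/* kernel: sha_transform(&bakery[0], (char *)mess, workspace); */
}
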
@@ -2315,7 +2527,9 @@ static void tcp_connect_init(struct sock *sk)
2315 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 2527 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
2316 */ 2528 */
2317 tp->tcp_header_len = sizeof(struct tcphdr) + 2529 tp->tcp_header_len = sizeof(struct tcphdr) +
2318 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); 2530 (sysctl_tcp_timestamps &&
2531 (!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ?
2532 TCPOLEN_TSTAMP_ALIGNED : 0));
2319 2533
2320#ifdef CONFIG_TCP_MD5SIG 2534#ifdef CONFIG_TCP_MD5SIG
2321 if (tp->af_specific->md5_lookup(sk, sk) != NULL) 2535 if (tp->af_specific->md5_lookup(sk, sk) != NULL)
@@ -2341,7 +2555,8 @@ static void tcp_connect_init(struct sock *sk)
2341 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2555 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2342 &tp->rcv_wnd, 2556 &tp->rcv_wnd,
2343 &tp->window_clamp, 2557 &tp->window_clamp,
2344 sysctl_tcp_window_scaling, 2558 (sysctl_tcp_window_scaling &&
2559 !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)),
2345 &rcv_wscale); 2560 &rcv_wscale);
2346 2561
2347 tp->rx_opt.rcv_wscale = rcv_wscale; 2562 tp->rx_opt.rcv_wscale = rcv_wscale;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 59f5b5e7c566..bb110c5ce1d2 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -94,8 +94,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 const struct inet_sock *inet = inet_sk(sk); 94 const struct inet_sock *inet = inet_sk(sk);
95 95
96 /* Only update if port matches */ 96 /* Only update if port matches */
97 if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) 97 if ((port == 0 || ntohs(inet->inet_dport) == port ||
98 && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { 98 ntohs(inet->inet_sport) == port) &&
99 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
99 100
100 spin_lock(&tcp_probe.lock); 101 spin_lock(&tcp_probe.lock);
101 /* If log fills, just silently drop */ 102 /* If log fills, just silently drop */
@@ -103,10 +104,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
103 struct tcp_log *p = tcp_probe.log + tcp_probe.head; 104 struct tcp_log *p = tcp_probe.log + tcp_probe.head;
104 105
105 p->tstamp = ktime_get(); 106 p->tstamp = ktime_get();
106 p->saddr = inet->saddr; 107 p->saddr = inet->inet_saddr;
107 p->sport = inet->sport; 108 p->sport = inet->inet_sport;
108 p->daddr = inet->daddr; 109 p->daddr = inet->inet_daddr;
109 p->dport = inet->dport; 110 p->dport = inet->inet_dport;
110 p->length = skb->len; 111 p->length = skb->len;
111 p->snd_nxt = tp->snd_nxt; 112 p->snd_nxt = tp->snd_nxt;
112 p->snd_una = tp->snd_una; 113 p->snd_una = tp->snd_una;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cdb2ca7684d4..8353a538cd4c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -141,14 +141,14 @@ static int tcp_write_timeout(struct sock *sk)
141 141
142 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 142 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
143 if (icsk->icsk_retransmits) 143 if (icsk->icsk_retransmits)
144 dst_negative_advice(&sk->sk_dst_cache); 144 dst_negative_advice(&sk->sk_dst_cache, sk);
145 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 145 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
146 } else { 146 } else {
147 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 147 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
148 /* Black hole detection */ 148 /* Black hole detection */
149 tcp_mtu_probing(icsk, sk); 149 tcp_mtu_probing(icsk, sk);
150 150
151 dst_negative_advice(&sk->sk_dst_cache); 151 dst_negative_advice(&sk->sk_dst_cache, sk);
152 } 152 }
153 153
154 retry_until = sysctl_tcp_retries2; 154 retry_until = sysctl_tcp_retries2;
@@ -303,15 +303,15 @@ void tcp_retransmit_timer(struct sock *sk)
303 struct inet_sock *inet = inet_sk(sk); 303 struct inet_sock *inet = inet_sk(sk);
304 if (sk->sk_family == AF_INET) { 304 if (sk->sk_family == AF_INET) {
305 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 305 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
306 &inet->daddr, ntohs(inet->dport), 306 &inet->inet_daddr, ntohs(inet->inet_dport),
307 inet->num, tp->snd_una, tp->snd_nxt); 307 inet->inet_num, tp->snd_una, tp->snd_nxt);
308 } 308 }
309#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 309#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
310 else if (sk->sk_family == AF_INET6) { 310 else if (sk->sk_family == AF_INET6) {
311 struct ipv6_pinfo *np = inet6_sk(sk); 311 struct ipv6_pinfo *np = inet6_sk(sk);
312 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 312 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
313 &np->daddr, ntohs(inet->dport), 313 &np->daddr, ntohs(inet->inet_dport),
314 inet->num, tp->snd_una, tp->snd_nxt); 314 inet->inet_num, tp->snd_una, tp->snd_nxt);
315 } 315 }
316#endif 316#endif
317#endif 317#endif
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index e9bbff746488..b612acf76183 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
165 * every other rtt. 165 * every other rtt.
166 */ 166 */
167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
168 if (veno->inc 168 if (veno->inc &&
169 && tp->snd_cwnd < 169 tp->snd_cwnd < tp->snd_cwnd_clamp) {
170 tp->snd_cwnd_clamp) {
171 tp->snd_cwnd++; 170 tp->snd_cwnd++;
172 veno->inc = 0; 171 veno->inc = 0;
173 } else 172 } else
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 66b6821b984e..a0f240358892 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 
 	if (queue > TCP_YEAH_ALPHA ||
 	    rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
-		if (queue > TCP_YEAH_ALPHA
-		    && tp->snd_cwnd > yeah->reno_count) {
+		if (queue > TCP_YEAH_ALPHA &&
+		    tp->snd_cwnd > yeah->reno_count) {
 			u32 reduction = min(queue / TCP_YEAH_GAMMA ,
 					    tp->snd_cwnd >> TCP_YEAH_EPSILON);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fa9f70e4b19..1f9534846ca9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -106,7 +106,7 @@
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-struct udp_table udp_table;
+struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
@@ -121,28 +121,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
-						 const struct sock *sk2))
+						 const struct sock *sk2),
+			       unsigned int log)
 {
 	struct sock *sk2;
 	struct hlist_nulls_node *node;
 
 	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net) &&
 		    sk2 != sk &&
-		    (bitmap || sk2->sk_hash == num) &&
+		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
-		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
-			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (*saddr_comp)(sk, sk2)) {
 			if (bitmap)
-				__set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
 					  bitmap);
 			else
 				return 1;
@@ -150,18 +152,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	return 0;
 }
 
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+				struct udp_hslot *hslot2,
+				struct sock *sk,
+				int (*saddr_comp)(const struct sock *sk1,
+						  const struct sock *sk2))
+{
+	struct sock *sk2;
+	struct hlist_nulls_node *node;
+	int res = 0;
+
+	spin_lock(&hslot2->lock);
+	udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+		if (net_eq(sock_net(sk2), net) &&
+		    sk2 != sk &&
+		    (udp_sk(sk2)->udp_port_hash == num) &&
+		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (*saddr_comp)(sk, sk2)) {
+			res = 1;
+			break;
+		}
+	spin_unlock(&hslot2->lock);
+	return res;
+}
+
 /**
  *	udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
  *
  *	@sk:          socket struct in question
  *	@snum:        port number to look up
  *	@saddr_comp:  AF-dependent comparison of bound local IP addresses
+ *	@hash2_nulladdr: AF-dependant hash value in secondary hash chains,
+ *	                 with NULL address
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		     int (*saddr_comp)(const struct sock *sk1,
-				       const struct sock *sk2))
+				       const struct sock *sk2),
+		     unsigned int hash2_nulladdr)
 {
-	struct udp_hslot *hslot;
+	struct udp_hslot *hslot, *hslot2;
 	struct udp_table *udptable = sk->sk_prot->h.udp_table;
 	int    error = 1;
 	struct net *net = sock_net(sk);
@@ -180,13 +215,15 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		/*
 		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
 		 */
-		rand = (rand | 1) * UDP_HTABLE_SIZE;
-		for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
-			hslot = &udptable->hash[udp_hashfn(net, first)];
+		rand = (rand | 1) * (udptable->mask + 1);
+		for (last = first + udptable->mask + 1;
+		     first != last;
+		     first++) {
+			hslot = udp_hashslot(udptable, net, first);
 			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
 			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-					    saddr_comp);
+					    saddr_comp, udptable->log);
 
 			snum = first;
 			/*
@@ -196,7 +233,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap))
 					goto found;
 				snum += rand;
 			} while (snum != first);
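
Port allocation no longer rescans the hash chain for every candidate
port: one walk over the (locked) chain fills a bitmap indexed by
port-within-chain, and the randomized probe sequence - stepped by an
odd multiple of the table size, so it never leaves the chain - then
tests each candidate with a single bit lookup. A self-contained
userspace model of that search; TABLE_SIZE, the RNG and the bit
helpers are stand-ins for the kernel's udptable, net_random() and
__set_bit()/test_bit():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define MAX_PORTS	65536
	#define TABLE_SIZE	128		/* stand-in for udptable->mask + 1 */
	#define PORTS_PER_CHAIN	(MAX_PORTS / TABLE_SIZE)

	static unsigned char bitmap[PORTS_PER_CHAIN / 8];
	static int port_busy[MAX_PORTS];	/* toy registry of bound ports */

	static void set_bit_(unsigned int n)  { bitmap[n / 8] |= 1u << (n % 8); }
	static int  test_bit_(unsigned int n) { return bitmap[n / 8] & (1u << (n % 8)); }

	static int pick_port(unsigned int low, unsigned int high)
	{
		unsigned int rnd = ((unsigned int)rand() | 1u) * TABLE_SIZE;
		unsigned int first = low + (unsigned int)rand() % (high - low + 1);
		unsigned int last = first + TABLE_SIZE;
		unsigned short snum;

		for (; first != last; first++) {
			unsigned int chain = first % TABLE_SIZE, p;

			/* one pass over the chain: record every busy port on it */
			memset(bitmap, 0, sizeof(bitmap));
			for (p = chain; p < MAX_PORTS; p += TABLE_SIZE)
				if (port_busy[p])
					set_bit_(p / TABLE_SIZE);

			/* every probe from here stays on this chain, so each
			 * candidate costs one test_bit() instead of a rescan */
			snum = (unsigned short)first;
			do {
				if (low <= snum && snum <= high &&
				    !test_bit_(snum / TABLE_SIZE))
					return snum;
				snum += (unsigned short)rnd;	/* wraps mod 65536 */
			} while (snum != (unsigned short)first);
		}
		return -1;	/* every candidate port is in use */
	}

	int main(void)
	{
		srand(1);
		port_busy[40000] = 1;
		printf("picked port %d\n", pick_port(32768, 61000));
		return 0;
	}
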
@@ -204,17 +241,51 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		}
 		goto fail;
 	} else {
-		hslot = &udptable->hash[udp_hashfn(net, snum)];
+		hslot = udp_hashslot(udptable, net, snum);
 		spin_lock_bh(&hslot->lock);
-		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
+		if (hslot->count > 10) {
+			int exist;
+			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+			slot2 &= udptable->mask;
+			hash2_nulladdr &= udptable->mask;
+
+			hslot2 = udp_hashslot2(udptable, slot2);
+			if (hslot->count < hslot2->count)
+				goto scan_primary_hash;
+
+			exist = udp_lib_lport_inuse2(net, snum, hslot2,
+						     sk, saddr_comp);
+			if (!exist && (hash2_nulladdr != slot2)) {
+				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+				exist = udp_lib_lport_inuse2(net, snum, hslot2,
+							     sk, saddr_comp);
+			}
+			if (exist)
+				goto fail_unlock;
+			else
+				goto found;
+		}
+scan_primary_hash:
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+					saddr_comp, 0))
 			goto fail_unlock;
 	}
 found:
-	inet_sk(sk)->num = snum;
-	sk->sk_hash = snum;
+	inet_sk(sk)->inet_num = snum;
+	udp_sk(sk)->udp_port_hash = snum;
+	udp_sk(sk)->udp_portaddr_hash ^= snum;
 	if (sk_unhashed(sk)) {
 		sk_nulls_add_node_rcu(sk, &hslot->head);
+		hslot->count++;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		spin_lock(&hslot2->lock);
+		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+					 &hslot2->head);
+		hslot2->count++;
+		spin_unlock(&hslot2->lock);
 	}
 	error = 0;
 fail_unlock:
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
 	return (!ipv6_only_sock(sk2) &&
-		(!inet1->rcv_saddr || !inet2->rcv_saddr ||
-		 inet1->rcv_saddr == inet2->rcv_saddr));
+		(!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+		 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+}
+
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+				       unsigned int port)
+{
+	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
 }
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
+	unsigned int hash2_nulladdr =
+		udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+	unsigned int hash2_partial =
+		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+	/* precompute partial secondary hash */
+	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
 }
 
 static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
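
udp4_portaddr_hash() is the key to the new secondary table: hash the
local address once (jhash_1word(), salted per namespace by
net_hash_mix()) and fold the port in with an XOR. Because the port
enters by XOR only, udp_v4_get_port() can precompute the address part
with port 0, and udp_lib_get_port() later rehashes with a single
`^= snum`. A toy demonstration of that algebra, with an arbitrary
32-bit mixer standing in for jhash_1word():

	#include <stdio.h>
	#include <stdint.h>

	#define TABLE_MASK 127u			/* stand-in for udptable->mask */

	/* stand-in for jhash_1word(saddr, net_hash_mix(net)); any decent
	 * 32-bit mixer demonstrates the same slot arithmetic */
	static uint32_t mix32(uint32_t x, uint32_t seed)
	{
		x ^= seed;
		x *= 0x9e3779b1u;		/* golden-ratio multiplier */
		x ^= x >> 16;
		return x;
	}

	static uint32_t portaddr_hash(uint32_t saddr, uint16_t port, uint32_t seed)
	{
		return mix32(saddr, seed) ^ port;
	}

	int main(void)
	{
		uint32_t saddr = 0xc0a80101u;	/* 192.168.1.1 */
		uint32_t seed  = 0xdeadbeefu;	/* per-netns salt */

		/* partial hash with port 0, precomputed at bind time ... */
		uint32_t partial = portaddr_hash(saddr, 0, seed);
		/* ... folded with the chosen port by a single XOR later */
		uint32_t full = partial ^ 4242u;

		printf("secondary slot = %u\n", (unsigned)(full & TABLE_MASK));
		printf("matches direct hash: %s\n",
		       full == portaddr_hash(saddr, 4242, seed) ? "yes" : "no");
		return 0;
	}
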
@@ -244,23 +328,61 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			!ipv6_only_sock(sk)) {
 		struct inet_sock *inet = inet_sk(sk);
 
 		score = (sk->sk_family == PF_INET ? 1 : 0);
-		if (inet->rcv_saddr) {
-			if (inet->rcv_saddr != daddr)
+		if (inet->inet_rcv_saddr) {
+			if (inet->inet_rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->inet_daddr) {
+			if (inet->inet_daddr != saddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
 				return -1;
 			score += 2;
 		}
-		if (inet->daddr) {
-			if (inet->daddr != saddr)
+	}
+	return score;
+}
+
+/*
+ * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
+ */
+#define SCORE2_MAX (1 + 2 + 2 + 2)
+static inline int compute_score2(struct sock *sk, struct net *net,
+				 __be32 saddr, __be16 sport,
+				 __be32 daddr, unsigned int hnum, int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->inet_rcv_saddr != daddr)
+			return -1;
+		if (inet->inet_num != hnum)
+			return -1;
+
+		score = (sk->sk_family == PF_INET ? 1 : 0);
+		if (inet->inet_daddr) {
+			if (inet->inet_daddr != saddr)
 				return -1;
 			score += 2;
 		}
-		if (inet->dport) {
-			if (inet->dport != sport)
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
 				return -1;
 			score += 2;
 		}
@@ -273,6 +395,51 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 	return score;
 }
 
+
+/* called with read_rcu_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+		__be32 saddr, __be16 sport,
+		__be32 daddr, unsigned int hnum, int dif,
+		struct udp_hslot *hslot2, unsigned int slot2)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	int score, badness;
+
+begin:
+	result = NULL;
+	badness = -1;
+	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+		score = compute_score2(sk, net, saddr, sport,
+				       daddr, hnum, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+			if (score == SCORE2_MAX)
+				goto exact_match;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot2)
+		goto begin;
+
+	if (result) {
+exact_match:
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score2(result, net, saddr, sport,
+				  daddr, hnum, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	return result;
+}
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
-	unsigned int hash = udp_hashfn(net, hnum);
-	struct udp_hslot *hslot = &udptable->hash[hash];
+	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
 	int score, badness;
 
 	rcu_read_lock();
+	if (hslot->count > 10) {
+		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		slot2 = hash2 & udptable->mask;
+		hslot2 = &udptable->hash2[slot2];
+		if (hslot->count < hslot2->count)
+			goto begin;
+
+		result = udp4_lib_lookup2(net, saddr, sport,
+					  daddr, hnum, dif,
+					  hslot2, slot2);
+		if (!result) {
+			hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
+			slot2 = hash2 & udptable->mask;
+			hslot2 = &udptable->hash2[slot2];
+			if (hslot->count < hslot2->count)
+				goto begin;
+
+			result = udp4_lib_lookup2(net, INADDR_ANY, sport,
+						  daddr, hnum, dif,
+						  hslot2, slot2);
+		}
+		rcu_read_unlock();
+		return result;
+	}
 begin:
 	result = NULL;
 	badness = -1;
@@ -304,7 +495,7 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(node) != hash)
+	if (get_nulls_value(node) != slot)
 		goto begin;
 
 	if (result) {
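
The receive-side lookup now has a cheap escape hatch: only when the
primary (port-keyed) chain holds more than ten sockets does it try the
secondary (address, port) table, and even then only if the secondary
chain is not the longer of the two; a perfect SCORE2_MAX match ends
the scan early. The slot-choice heuristic in isolation - struct hslot
here carries just the count field the decision needs:

	#include <stdio.h>

	/* toy hash slot with only the bookkeeping the heuristic reads */
	struct hslot { unsigned int count; };

	/* mirrors the decision in __udp4_lib_lookup(): scan the secondary
	 * (addr, port) chain only when the primary (port) chain is long AND
	 * the secondary chain is not longer still; returns 1 for "use hash2" */
	static int use_secondary(const struct hslot *primary,
				 const struct hslot *secondary)
	{
		if (primary->count <= 10)
			return 0;	/* short chain: a linear scan is cheap */
		if (primary->count < secondary->count)
			return 0;	/* secondary chain would be worse */
		return 1;
	}

	int main(void)
	{
		struct hslot port_chain = { .count = 64 };	/* e.g. many sockets on one port */
		struct hslot addr_chain = { .count = 3 };

		printf("use secondary table: %d\n",
		       use_secondary(&port_chain, &addr_chain));
		return 0;
	}
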
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net) ||
-		    s->sk_hash != hnum ||
-		    (inet->daddr && inet->daddr != rmt_addr) ||
-		    (inet->dport != rmt_port && inet->dport) ||
-		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
-		    ipv6_only_sock(s) ||
+		    udp_sk(s)->udp_port_hash != hnum ||
+		    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+		    (inet->inet_dport != rmt_port && inet->inet_dport) ||
+		    (inet->inet_rcv_saddr &&
+		     inet->inet_rcv_saddr != loc_addr) ||
+		    ipv6_only_sock(s) ||
 		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
 			continue;
 		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else {
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
-		daddr = inet->daddr;
-		dport = inet->dport;
+		daddr = inet->inet_daddr;
+		dport = inet->inet_dport;
 		/* Open fast path for connected socket.
 		   Route will not be used, if at least one option is set.
 		 */
 		connected = 1;
 	}
-	ipc.addr = inet->saddr;
+	ipc.addr = inet->inet_saddr;
 
 	ipc.oif = sk->sk_bound_dev_if;
 	err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				    .proto = sk->sk_protocol,
 				    .flags = inet_sk_flowi_flags(sk),
 				    .uli_u = { .ports =
-					       { .sport = inet->sport,
+					       { .sport = inet->inet_sport,
 						 .dport = dport } } };
 		struct net *net = sock_net(sk);
 
@@ -748,7 +940,7 @@ back_from_confirm:
 	inet->cork.fl.fl4_dst = daddr;
 	inet->cork.fl.fl_ip_dport = dport;
 	inet->cork.fl.fl4_src = saddr;
-	inet->cork.fl.fl_ip_sport = inet->sport;
+	inet->cork.fl.fl_ip_sport = inet->inet_sport;
 	up->pending = AF_INET;
 
 do_append_data:
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk)
 		    udp_lib_checksum_complete(skb)) {
 			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
 					 IS_UDPLITE(sk));
+			atomic_inc(&sk->sk_drops);
 			__skb_unlink(skb, rcvq);
 			__skb_queue_tail(&list_kill, skb);
 		}
@@ -982,7 +1175,7 @@ try_again:
 		UDP_INC_STATS_USER(sock_net(sk),
 				UDP_MIB_INDATAGRAMS, is_udplite);
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (sin) {
@@ -1023,15 +1216,15 @@ int udp_disconnect(struct sock *sk, int flags)
 	 */
 
 	sk->sk_state = TCP_CLOSE;
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk->sk_bound_dev_if = 0;
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
 
 	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
 		sk->sk_prot->unhash(sk);
-		inet->sport = 0;
+		inet->inet_sport = 0;
 	}
 	sk_dst_reset(sk);
 	return 0;
@@ -1042,13 +1235,22 @@ void udp_lib_unhash(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = sk->sk_prot->h.udp_table;
-		unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
-		struct udp_hslot *hslot = &udptable->hash[hash];
+		struct udp_hslot *hslot, *hslot2;
+
+		hslot  = udp_hashslot(udptable, sock_net(sk),
+				      udp_sk(sk)->udp_port_hash);
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
-			inet_sk(sk)->num = 0;
+			hslot->count--;
+			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
 		}
 		spin_unlock_bh(&hslot->lock);
 	}
@@ -1057,25 +1259,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int is_udplite = IS_UDPLITE(sk);
-	int rc;
+	int rc = sock_queue_rcv_skb(sk, skb);
+
+	if (rc < 0) {
+		int is_udplite = IS_UDPLITE(sk);
 
-	if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
-		if (rc == -ENOMEM) {
+		if (rc == -ENOMEM)
 			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 					 is_udplite);
-			atomic_inc(&sk->sk_drops);
-		}
-		goto drop;
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+		kfree_skb(skb);
+		return -1;
 	}
 
 	return 0;
 
-drop:
-	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
-	kfree_skb(skb);
-	return -1;
 }
 
 /* returns:
@@ -1182,53 +1381,88 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 drop:
 	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	atomic_inc(&sk->sk_drops);
 	kfree_skb(skb);
 	return -1;
 }
 
+
+static void flush_stack(struct sock **stack, unsigned int count,
+			struct sk_buff *skb, unsigned int final)
+{
+	unsigned int i;
+	struct sk_buff *skb1 = NULL;
+	struct sock *sk;
+
+	for (i = 0; i < count; i++) {
+		sk = stack[i];
+		if (likely(skb1 == NULL))
+			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+		if (!skb1) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(sk));
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+					 IS_UDPLITE(sk));
+		}
+
+		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+			skb1 = NULL;
+	}
+	if (unlikely(skb1))
+		kfree_skb(skb1);
+}
+
 /*
  *	Multicasts and broadcasts go to each listener.
  *
- *	Note: called only from the BH handler context,
- *	so we don't need to lock the hashes.
+ *	Note: called only from the BH handler context.
  */
 static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udphdr *uh,
 				    __be32 saddr, __be32 daddr,
 				    struct udp_table *udptable)
 {
-	struct sock *sk;
-	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
 	int dif;
+	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
 	sk = sk_nulls_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	if (sk) {
-		struct sock *sknext = NULL;
-
-		do {
-			struct sk_buff *skb1 = skb;
-
-			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-						   daddr, uh->source, saddr,
-						   dif);
-			if (sknext)
-				skb1 = skb_clone(skb, GFP_ATOMIC);
-
-			if (skb1) {
-				int ret = udp_queue_rcv_skb(sk, skb1);
-				if (ret > 0)
-					/* we should probably re-process instead
-					 * of dropping packets here. */
-					kfree_skb(skb1);
-			}
-			sk = sknext;
-		} while (sknext);
-	} else
-		consume_skb(skb);
+	while (sk) {
+		stack[count++] = sk;
+		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
+				       daddr, uh->source, saddr, dif);
+		if (unlikely(count == ARRAY_SIZE(stack))) {
+			if (!sk)
+				break;
+			flush_stack(stack, count, skb, ~0);
+			count = 0;
+		}
+	}
+	/*
+	 * before releasing chain lock, we must take a reference on sockets
+	 */
+	for (i = 0; i < count; i++)
+		sock_hold(stack[i]);
+
 	spin_unlock(&hslot->lock);
+
+	/*
+	 * do the slow work with no lock held
+	 */
+	if (count) {
+		flush_stack(stack, count, skb, count - 1);
+
+		for (i = 0; i < count; i++)
+			sock_put(stack[i]);
+	} else {
+		kfree_skb(skb);
+	}
 	return 0;
 }
 
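
__udp4_lib_mcast_deliver() used to clone and queue every datagram
while holding the chain lock; now it only collects matching sockets
into an on-stack array under the lock (flushing mid-scan if the array
fills), takes references, and does the cloning and queueing
afterwards. The same collect-then-flush shape in a generic userspace
form, with a pthread mutex and integer IDs standing in for the hslot
lock and struct sock (refcounting omitted for brevity):

	#include <stdio.h>
	#include <pthread.h>

	#define NSOCKS		8
	#define STACK_MAX	4

	static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;
	static int chain[NSOCKS] = { 1, 0, 1, 1, 0, 1, 0, 1 };	/* 1 = match */

	static void deliver(int idx)	/* stand-in for queueing a clone */
	{
		printf("deliver to socket %d\n", idx);
	}

	static void flush_stack_(const int *stack, unsigned int count)
	{
		unsigned int i;
		for (i = 0; i < count; i++)
			deliver(stack[i]);
	}

	static void mcast_deliver(void)
	{
		int stack[STACK_MAX];
		unsigned int i, count = 0;

		pthread_mutex_lock(&chain_lock);
		for (i = 0; i < NSOCKS; i++) {
			if (!chain[i])
				continue;
			stack[count++] = i;	  /* kernel also takes a ref here */
			if (count == STACK_MAX) { /* overflow: flush while locked */
				flush_stack_(stack, count);
				count = 0;
			}
		}
		pthread_mutex_unlock(&chain_lock);

		flush_stack_(stack, count);	  /* slow work with no lock held */
	}

	int main(void)
	{
		mcast_deliver();
		return 0;
	}
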
@@ -1620,9 +1854,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+	for (state->bucket = start; state->bucket <= state->udp_table->mask;
+	     ++state->bucket) {
 		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+		if (hlist_nulls_empty(&hslot->head))
+			continue;
+
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1886,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
-		if (state->bucket < UDP_HTABLE_SIZE)
+		if (state->bucket <= state->udp_table->mask)
 			spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 		return udp_get_first(seq, state->bucket + 1);
 	}
@@ -1667,7 +1906,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct udp_iter_state *state = seq->private;
-	state->bucket = UDP_HTABLE_SIZE;
+	state->bucket = MAX_UDP_PORTS;
 
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
@@ -1689,7 +1928,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
 {
 	struct udp_iter_state *state = seq->private;
 
-	if (state->bucket < UDP_HTABLE_SIZE)
+	if (state->bucket <= state->udp_table->mask)
 		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 }
 
@@ -1744,12 +1983,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		int bucket, int *len)
 {
 	struct inet_sock *inet = inet_sk(sp);
-	__be32 dest = inet->daddr;
-	__be32 src  = inet->rcv_saddr;
-	__u16 destp = ntohs(inet->dport);
-	__u16 srcp  = ntohs(inet->sport);
+	__be32 dest = inet->inet_daddr;
+	__be32 src  = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp  = ntohs(inet->inet_sport);
 
-	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
@@ -1815,21 +2054,60 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-void __init udp_table_init(struct udp_table *table)
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
 {
-	int i;
+	if (!str)
+		return 0;
+	uhash_entries = simple_strtoul(str, &str, 0);
+	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+		uhash_entries = UDP_HTABLE_SIZE_MIN;
+	return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
 
-	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+	unsigned int i;
+
+	if (!CONFIG_BASE_SMALL)
+		table->hash = alloc_large_system_hash(name,
+						      2 * sizeof(struct udp_hslot),
+						      uhash_entries,
+						      21, /* one slot per 2 MB */
+						      0,
+						      &table->log,
+						      &table->mask,
+						      64 * 1024);
+	/*
+	 * Make sure hash table has the minimum size
+	 */
+	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+				      2 * sizeof(struct udp_hslot), GFP_KERNEL);
+		if (!table->hash)
+			panic(name);
+		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+		table->mask = UDP_HTABLE_SIZE_MIN - 1;
+	}
+	table->hash2 = table->hash + (table->mask + 1);
+	for (i = 0; i <= table->mask; i++) {
 		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+		table->hash[i].count = 0;
 		spin_lock_init(&table->hash[i].lock);
 	}
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		table->hash2[i].count = 0;
+		spin_lock_init(&table->hash2[i].lock);
+	}
 }
 
 void __init udp_init(void)
 {
 	unsigned long nr_pages, limit;
 
-	udp_table_init(&udp_table);
+	udp_table_init(&udp_table, "UDP");
 	/* Set the pressure threshold up by the same strategy of TCP. It is a
 	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
 	 * toward zero with the amount of memory, with a floor of 128 pages.
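
udp_table_init() now sizes the table at boot (alloc_large_system_hash()
or the uhash_entries= parameter, with a floor of UDP_HTABLE_SIZE_MIN)
and carves both tables out of one allocation: hash2 simply starts
mask + 1 slots after hash, and power-of-two sizing lets every lookup
use `& mask` instead of a modulo. The layout in miniature - calloc()
and a fixed log2 size replace the kernel's boot-time policy:

	#include <stdio.h>
	#include <stdlib.h>

	struct hslot { unsigned int count; /* + head and lock in the kernel */ };

	struct udp_table_model {
		struct hslot *hash;	/* primary table: keyed on local port */
		struct hslot *hash2;	/* secondary: keyed on (address, port) */
		unsigned int mask;	/* slots - 1; slot count is a power of two */
		unsigned int log;	/* log2 of the slot count */
	};

	static int table_init(struct udp_table_model *t, unsigned int log)
	{
		unsigned int slots = 1u << log;

		/* one allocation holds both tables, back to back */
		t->hash = calloc(2 * slots, sizeof(struct hslot));
		if (!t->hash)
			return -1;
		t->hash2 = t->hash + slots;
		t->mask = slots - 1;
		t->log = log;
		return 0;
	}

	int main(void)
	{
		struct udp_table_model t;

		if (table_init(&t, 8))		/* 256 slots, for illustration */
			return 1;
		printf("slots=%u mask=%#x hash2 offset=%ld\n",
		       t.mask + 1, t.mask, (long)(t.hash2 - t.hash));
		free(t.hash);
		return 0;
	}
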
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 95248d7f75ec..66f79513f4a5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,7 +12,7 @@
  */
 #include "udp_impl.h"
 
-struct udp_table udplite_table;
+struct udp_table udplite_table __read_mostly;
 EXPORT_SYMBOL(udplite_table);
 
 static int udplite_rcv(struct sk_buff *skb)
@@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = {
 	.protocol	=  IPPROTO_UDPLITE,
 	.prot		=  &udplite_prot,
 	.ops		=  &inet_dgram_ops,
-	.capability	= -1,
 	.no_check	=  0,		/* must checksum (RFC 3828) */
 	.flags		=  INET_PROTOSW_PERMANENT,
 };
@@ -110,7 +109,7 @@ static inline int udplite4_proc_init(void)
 
 void __init udplite4_register(void)
 {
-	udp_table_init(&udplite_table);
+	udp_table_init(&udplite_table, "UDP-Lite");
 	if (proto_register(&udplite_prot, 1))
 		goto out_register_err;
 