path: root/net/ipv4
author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent    6a00f206debf8a5c8899055726ad127dbeeed098
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 56
-rw-r--r--  net/ipv4/Makefile | 7
-rw-r--r--  net/ipv4/af_inet.c | 133
-rw-r--r--  net/ipv4/ah4.c | 41
-rw-r--r--  net/ipv4/arp.c | 286
-rw-r--r--  net/ipv4/cipso_ipv4.c | 123
-rw-r--r--  net/ipv4/datagram.c | 31
-rw-r--r--  net/ipv4/devinet.c | 250
-rw-r--r--  net/ipv4/esp4.c | 143
-rw-r--r--  net/ipv4/fib_frontend.c | 329
-rw-r--r--  net/ipv4/fib_hash.c | 1070
-rw-r--r--  net/ipv4/fib_lookup.h | 27
-rw-r--r--  net/ipv4/fib_rules.c | 36
-rw-r--r--  net/ipv4/fib_semantics.c | 531
-rw-r--r--  net/ipv4/fib_trie.c | 458
-rw-r--r--  net/ipv4/gre.c | 152
-rw-r--r--  net/ipv4/icmp.c | 322
-rw-r--r--  net/ipv4/igmp.c | 357
-rw-r--r--  net/ipv4/inet_connection_sock.c | 70
-rw-r--r--  net/ipv4/inet_diag.c | 45
-rw-r--r--  net/ipv4/inet_hashtables.c | 31
-rw-r--r--  net/ipv4/inet_lro.c | 4
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 2
-rw-r--r--  net/ipv4/inetpeer.c | 364
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_fragment.c | 87
-rw-r--r--  net/ipv4/ip_gre.c | 283
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_options.c | 85
-rw-r--r--  net/ipv4/ip_output.c | 474
-rw-r--r--  net/ipv4/ip_sockglue.c | 47
-rw-r--r--  net/ipv4/ipcomp.c | 4
-rw-r--r--  net/ipv4/ipconfig.c | 69
-rw-r--r--  net/ipv4/ipip.c | 229
-rw-r--r--  net/ipv4/ipmr.c | 566
-rw-r--r--  net/ipv4/netfilter.c | 77
-rw-r--r--  net/ipv4/netfilter/Kconfig | 19
-rw-r--r--  net/ipv4/netfilter/Makefile | 7
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 131
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 8
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 6
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 163
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 49
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 148
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 134
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 7
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 45
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 128
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 53
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 76
-rw-r--r--  net/ipv4/netfilter/nf_nat_irc.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 19
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 13
-rw-r--r--  net/ipv4/ping.c | 931
-rw-r--r--  net/ipv4/proc.c | 9
-rw-r--r--  net/ipv4/protocol.c | 33
-rw-r--r--  net/ipv4/raw.c | 120
-rw-r--r--  net/ipv4/route.c | 1697
-rw-r--r--  net/ipv4/syncookies.c | 28
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 89
-rw-r--r--  net/ipv4/tcp.c | 70
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 56
-rw-r--r--  net/ipv4/tcp_highspeed.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 4
-rw-r--r--  net/ipv4/tcp_input.c | 152
-rw-r--r--  net/ipv4/tcp_ipv4.c | 231
-rw-r--r--  net/ipv4/tcp_lp.c | 4
-rw-r--r--  net/ipv4/tcp_minisocks.c | 67
-rw-r--r--  net/ipv4/tcp_output.c | 113
-rw-r--r--  net/ipv4/tcp_probe.c | 5
-rw-r--r--  net/ipv4/tcp_scalable.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 53
-rw-r--r--  net/ipv4/tcp_vegas.c | 2
-rw-r--r--  net/ipv4/tcp_veno.c | 4
-rw-r--r--  net/ipv4/tcp_westwood.c | 4
-rw-r--r--  net/ipv4/tcp_yeah.c | 4
-rw-r--r--  net/ipv4/tunnel4.c | 44
-rw-r--r--  net/ipv4/udp.c | 207
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 2
-rw-r--r--  net/ipv4/xfrm4_output.c | 15
-rw-r--r--  net/ipv4/xfrm4_policy.c | 142
-rw-r--r--  net/ipv4/xfrm4_state.c | 23
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 4
96 files changed, 6506 insertions, 5512 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7cd7760144f7..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -215,9 +182,15 @@ config NET_IPIP
 	  be inserted in and removed from the running kernel whenever you
 	  want). Most people won't need this and can say N.
 
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	 This is helper module to demultiplex GRE packets on GRE version field criteria.
+	 Required by ip_gre and pptp modules.
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
-	depends on IPV6 || IPV6=n
+	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -426,7 +399,9 @@ config INET_DIAG
 	---help---
 	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
 	  native Linux tools such as ss. ss is included in iproute2, currently
-	  downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
+	  downloadable at:
+
+	  http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
 
 	  If unsure, say Y.
 
@@ -556,7 +531,7 @@ config TCP_CONG_VENO
 	  distinguishing to circumvent the difficult judgment of the packet loss
 	  type. TCP Veno cuts down less congestion window in response to random
 	  loss packets.
-	  See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+	  See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
 
 config TCP_CONG_YEAH
 	tristate "YeAH TCP"
@@ -649,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..f2dc69cffb57 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,16 +10,15 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
-	     inet_fragment.o
+	     fib_frontend.o fib_semantics.o fib_trie.o \
+	     inet_fragment.o ping.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..ef1528af7abf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -105,6 +105,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/udplite.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/raw.h>
@@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_wmem_queued);
 	WARN_ON(sk->sk_forward_alloc);
 
-	kfree(inet->opt);
+	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 	sk_refcnt_debug_dec(sk);
 }
@@ -227,18 +228,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
 
 /*
  * inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
  */
 void build_ehash_secret(void)
 {
 	u32 rnd;
+
 	do {
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
-	spin_lock_bh(&inetsw_lock);
-	if (!inet_ehash_secret)
-		inet_ehash_secret = rnd;
-	spin_unlock_bh(&inetsw_lock);
+
+	cmpxchg(&inet_ehash_secret, 0, rnd);
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
@@ -466,6 +465,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		goto out;
 
+	if (addr->sin_family != AF_INET) {
+		err = -EAFNOSUPPORT;
+		goto out;
+	}
+
 	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
 
 	/* Not specified by any standard per-se, however it breaks too
@@ -674,6 +678,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	lock_sock(sk2);
 
+	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
 		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
@@ -882,6 +887,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(inet_ioctl);
 
+#ifdef CONFIG_COMPAT
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	int err = -ENOIOCTLCMD;
+
+	if (sk->sk_prot->compat_ioctl)
+		err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+	return err;
+}
+#endif
+
 const struct proto_ops inet_stream_ops = {
 	.family = PF_INET,
 	.owner = THIS_MODULE,
@@ -905,6 +923,7 @@ const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_stream_ops);
@@ -931,6 +950,7 @@ const struct proto_ops inet_dgram_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_dgram_ops);
@@ -961,6 +981,7 @@ static const struct proto_ops inet_sockraw_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 
@@ -994,6 +1015,14 @@ static struct inet_protosw inetsw_array[] =
 		.flags = INET_PROTOSW_PERMANENT,
 	},
 
+	{
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_ICMP,
+		.prot = &ping_prot,
+		.ops = &inet_dgram_ops,
+		.no_check = UDP_CSUM_DEFAULT,
+		.flags = INET_PROTOSW_REUSE,
+	},
 
 	{
 		.type = SOCK_RAW,
@@ -1087,27 +1116,29 @@ int sysctl_ip_dynaddr __read_mostly;
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
 	__be32 old_saddr = inet->inet_saddr;
-	__be32 new_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 new_saddr;
+	struct ip_options_rcu *inet_opt;
 
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
 
 	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       sk->sk_protocol,
-			       inet->inet_sport, inet->inet_dport, sk, 0);
-	if (err)
-		return err;
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sk_setup_caps(sk, &rt->dst);
 
-	new_saddr = rt->rt_src;
+	new_saddr = fl4->saddr;
 
 	if (new_saddr == old_saddr)
 		return 0;
@@ -1136,6 +1167,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
 	int err;
 
 	/* Route is OK, nothing to do. */
@@ -1143,36 +1176,23 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
 	daddr = inet->inet_daddr;
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-{
-	struct flowi fl = {
-		.oif = sk->sk_bound_dev_if,
-		.mark = sk->sk_mark,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = daddr,
-				.saddr = inet->inet_saddr,
-				.tos = RT_CONN_FLAGS(sk),
-			},
-		},
-		.proto = sk->sk_protocol,
-		.flags = inet_sk_flowi_flags(sk),
-		.uli_u = {
-			.ports = {
-				.sport = inet->inet_sport,
-				.dport = inet->inet_dport,
-			},
-		},
-	};
-
-	security_sk_classify_flow(sk, &fl);
-	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
-	if (!err)
-		sk_setup_caps(sk, &rt->dst);
-	else {
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	rcu_read_unlock();
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
+		sk_setup_caps(sk, &rt->dst);
+	} else {
+		err = PTR_ERR(rt);
+
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
 		/*
@@ -1192,7 +1212,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	const struct net_protocol *ops;
 	int proto;
 	int ihl;
@@ -1225,7 +1245,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
@@ -1299,7 +1319,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	const struct net_protocol *ops;
 	struct sk_buff **pp = NULL;
 	struct sk_buff *p;
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	unsigned int hlen;
 	unsigned int off;
 	unsigned int id;
@@ -1522,6 +1542,7 @@ static const struct net_protocol udp_protocol = {
 
 static const struct net_protocol icmp_protocol = {
 	.handler = icmp_rcv,
+	.err_handler = ping_err,
 	.no_policy = 1,
 	.netns_ok = 1,
 };
@@ -1637,6 +1658,10 @@ static int __init inet_init(void)
 	if (rc)
 		goto out_unregister_udp_proto;
 
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
 	/*
 	 * Tell SOCKET that we are alive...
 	 */
@@ -1692,6 +1717,8 @@ static int __init inet_init(void)
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();
 
+	ping_init();
+
 	/*
 	 * Set the ICMP layer up
 	 */
@@ -1722,6 +1749,8 @@ static int __init inet_init(void)
 	rc = 0;
 out:
 	return rc;
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
 out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
@@ -1746,11 +1775,15 @@ static int __init ipv4_proc_init(void)
 		goto out_tcp;
 	if (udp4_proc_init())
 		goto out_udp;
+	if (ping_proc_init())
+		goto out_ping;
 	if (ip_misc_proc_init())
 		goto out_misc;
out:
 	return rc;
out_misc:
+	ping_proc_exit();
+out_ping:
 	udp4_proc_exit();
out_udp:
 	tcp4_proc_exit();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 880a5ec6dce0..c1f4154552fc 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -73,7 +73,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
  * into IP header for icv calculation. Options are already checked
  * for validity, so paranoia is not required. */
 
-static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
 {
 	unsigned char * optptr = (unsigned char*)(iph+1);
 	int  l = iph->ihl*4 - sizeof(struct iphdr);
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
-	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -314,14 +323,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb->ip_summed = CHECKSUM_NONE;
 
-	ah = (struct ip_auth_hdr *)skb->data;
-	iph = ip_hdr(skb);
-	ihl = ip_hdrlen(skb);
 
 	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
 		goto out;
 	nfrags = err;
 
+	ah = (struct ip_auth_hdr *)skb->data;
+	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
 	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
 	if (!work_iph)
 		goto out;
@@ -386,7 +396,7 @@ out:
 static void ah4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
-	struct iphdr *iph = (struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
@@ -394,7 +404,8 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -449,8 +460,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..1b74d3b64371 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
  *		Stuart Cheshire	:	Metricom and grat arp fixes
  *					*** FOR 2.1 clean this up ***
  *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
  *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
  *					folded into the mainstream FDDI code.
  *					Ack spit, Linus how did you allow that
  *					one in...
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook);
 #endif
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/netfilter_arp.h>
 
 /*
  *	Interface to generic neighbour cache.
  */
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
 	.queue_xmit = dev_queue_xmit,
 };
 
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
 	.family = AF_INET,
 	.solicit = arp_solicit,
 	.error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
 	.hh_output = dev_queue_xmit,
 	.queue_xmit = dev_queue_xmit,
 };
-EXPORT_SYMBOL(arp_broken_ops);
 
 struct neigh_table arp_tbl = {
 	.family = AF_INET,
 	.entry_size = sizeof(struct neighbour) + 4,
 	.key_len = 4,
 	.hash = arp_hash,
 	.constructor = arp_constructor,
 	.proxy_redo = parp_redo,
 	.id = "arp_cache",
 	.parms = {
 		.tbl = &arp_tbl,
 		.base_reachable_time = 30 * HZ,
 		.retrans_time = 1 * HZ,
 		.gc_staletime = 60 * HZ,
 		.reachable_time = 30 * HZ,
 		.delay_probe_time = 5 * HZ,
 		.queue_len = 3,
 		.ucast_probes = 3,
 		.mcast_probes = 3,
 		.anycast_delay = 1 * HZ,
 		.proxy_delay = (8 * HZ) / 10,
 		.proxy_qlen = 64,
 		.locktime = 1 * HZ,
 	},
 	.gc_interval = 30 * HZ,
 	.gc_thresh1 = 128,
 	.gc_thresh2 = 512,
 	.gc_thresh3 = 1024,
 };
 EXPORT_SYMBOL(arp_tbl);
 
@@ -216,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 	case ARPHRD_INFINIBAND:
 		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		return 0;
 	default:
 		if (dir) {
 			memcpy(haddr, dev->broadcast, dev->addr_len);
@@ -226,14 +228,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 }
 
 
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 hash_rnd)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
 {
-	__be32 addr = *(__be32*)neigh->primary_key;
+	__be32 addr = *(__be32 *)neigh->primary_key;
 	struct net_device *dev = neigh->dev;
 	struct in_device *in_dev;
 	struct neigh_parms *parms;
@@ -296,16 +300,19 @@ static int arp_constructor(struct neighbour *neigh)
 			neigh->ops = &arp_broken_ops;
 			neigh->output = neigh->ops->output;
 			return 0;
+#else
+			break;
 #endif
-		;}
+		}
 #endif
 	if (neigh->type == RTN_MULTICAST) {
 		neigh->nud_state = NUD_NOARP;
 		arp_mc_map(addr, neigh->ha, dev, 1);
-	} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+	} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
 		neigh->nud_state = NUD_NOARP;
 		memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-	} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+	} else if (neigh->type == RTN_BROADCAST ||
+		   (dev->flags & IFF_POINTOPOINT)) {
 		neigh->nud_state = NUD_NOARP;
 		memcpy(neigh->ha, dev->broadcast, dev->addr_len);
 	}
@@ -315,7 +322,7 @@ static int arp_constructor(struct neighbour *neigh)
 	else
 		neigh->ops = &arp_generic_ops;
 
-	if (neigh->nud_state&NUD_VALID)
+	if (neigh->nud_state & NUD_VALID)
 		neigh->output = neigh->ops->connected_output;
 	else
 		neigh->output = neigh->ops->output;
@@ -334,7 +341,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	__be32 saddr = 0;
 	u8 *dst_ha = NULL;
 	struct net_device *dev = neigh->dev;
-	__be32 target = *(__be32*)neigh->primary_key;
+	__be32 target = *(__be32 *)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
 	struct in_device *in_dev;
 
@@ -347,7 +354,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:		/* By default announce any local IP */
-		if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
 			saddr = ip_hdr(skb)->saddr;
 		break;
 	case 1:		/* Restrict announcements of saddr in same subnet */
@@ -369,16 +377,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	if (!saddr)
 		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
 
-	if ((probes -= neigh->parms->ucast_probes) < 0) {
-		if (!(neigh->nud_state&NUD_VALID))
-			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
 		dst_ha = neigh->ha;
 		read_lock_bh(&neigh->lock);
-	} else if ((probes -= neigh->parms->app_probes) < 0) {
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
 #ifdef CONFIG_ARPD
 			neigh_app_ns(neigh);
 #endif
 			return;
+		}
 	}
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -423,14 +436,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
-						 .saddr = tip } } };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	if (ip_route_output_key(net, &rt, &fl) < 0)
+	rt = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -451,7 +463,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
  * is allowed to use this function, it is scheduled to be removed. --ANK
  */
 
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
			      __be32 paddr, struct net_device *dev)
 {
 	switch (addr_hint) {
 	case RTN_LOCAL:
@@ -483,17 +496,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 
 	paddr = skb_rtable(skb)->rt_gateway;
 
-	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+			       paddr, dev))
 		return 0;
 
 	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
 
 	if (n) {
 		n->used = jiffies;
-		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-			read_lock_bh(&n->lock);
-			memcpy(haddr, n->ha, dev->addr_len);
-			read_unlock_bh(&n->lock);
+		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+			neigh_ha_snapshot(haddr, n, dev);
 			neigh_release(n);
 			return 0;
 		}
@@ -515,13 +527,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
 		return -EINVAL;
 	if (n == NULL) {
 		__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
-		if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+		if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
 			nexthop = 0;
 		n = __neigh_lookup_errno(
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-		    dev->type == ARPHRD_ATM ? clip_tbl_hook :
+					 dev->type == ARPHRD_ATM ?
+					 clip_tbl_hook :
 #endif
					 &arp_tbl, &nexthop, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		dst->neighbour = n;
@@ -543,8 +556,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
-
-	if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+	imi = IN_DEV_MEDIUM_ID(in_dev);
+	if (imi == 0)
 		return 1;
 	if (imi == -1)
 		return 0;
@@ -555,7 +568,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 	if (out_dev)
 		omi = IN_DEV_MEDIUM_ID(out_dev);
 
-	return (omi != imi && omi != -1);
+	return omi != imi && omi != -1;
 }
 
 /*
@@ -685,7 +698,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp->ar_pln = 4;
 	arp->ar_op = htons(type);
 
-	arp_ptr=(unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 
 	memcpy(arp_ptr, src_hw, dev->addr_len);
 	arp_ptr += dev->addr_len;
@@ -735,9 +748,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 
 	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
			 dest_hw, src_hw, target_hw);
-	if (skb == NULL) {
+	if (skb == NULL)
 		return;
-	}
 
 	arp_xmit(skb);
 }
@@ -815,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
 /*
  *	Extract fields
  */
-	arp_ptr= (unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 	sha	= arp_ptr;
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
@@ -869,16 +881,17 @@ static int arp_process(struct sk_buff *skb)
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
-			int dont_send = 0;
+			int dont_send;
 
-			if (!dont_send)
-				dont_send |= arp_ignore(in_dev,sip,tip);
+			dont_send = arp_ignore(in_dev, sip, tip);
 			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-				dont_send |= arp_filter(sip,tip,dev);
+				dont_send = arp_filter(sip, tip, dev);
 			if (!dont_send) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
 					neigh_release(n);
 				}
 			}
@@ -887,8 +900,7 @@ static int arp_process(struct sk_buff *skb)
 		if (addr_type == RTN_UNICAST &&
 		    (arp_fwd_proxy(in_dev, dev, rt) ||
 		     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
-		{
+		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
 			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 			if (n)
 				neigh_release(n);
@@ -896,9 +908,12 @@ static int arp_process(struct sk_buff *skb)
 			if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
 			    skb->pkt_type == PACKET_HOST ||
 			    in_dev->arp_parms->proxy_delay == 0) {
-				arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+				arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
					 dev, tip, sha, dev->dev_addr,
					 sha);
 			} else {
-				pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+				pneigh_enqueue(&arp_tbl,
					       in_dev->arp_parms, skb);
 				return 0;
 			}
 			goto out;
@@ -939,7 +954,8 @@ static int arp_process(struct sk_buff *skb)
 		if (arp->ar_op != htons(ARPOP_REPLY) ||
 		    skb->pkt_type != PACKET_HOST)
 			state = NUD_STALE;
-		neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
 		neigh_release(n);
 	}
 
@@ -975,7 +991,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 	    arp->ar_pln != 4)
 		goto freeskb;
 
-	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
 		goto out_of_mem;
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1018,8 +1035,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 	if (mask && mask != htonl(0xFFFFFFFF))
 		return -EINVAL;
 	if (!dev && (r->arp_flags & ATF_COM)) {
-		dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
				      r->arp_ha.sa_data);
 		if (!dev)
 			return -ENODEV;
 	}
@@ -1033,7 +1050,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_set(struct net *net, struct arpreq *r,
-		       struct net_device * dev)
+		       struct net_device *dev)
 {
 	__be32 ip;
 	struct neighbour *neigh;
@@ -1046,11 +1063,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		unsigned state = NUD_STALE;
 		if (r->arp_flags & ATF_PERM)
 			state = NUD_PERMANENT;
-		err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
				   r->arp_ha.sa_data : NULL, state,
-				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_OVERRIDE |
				   NEIGH_UPDATE_F_ADMIN);
 		neigh_release(neigh);
 	}
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 
 static unsigned arp_state_to_flags(struct neighbour *neigh)
 {
-	unsigned flags = 0;
 	if (neigh->nud_state&NUD_PERMANENT)
-		flags = ATF_PERM|ATF_COM;
+		return ATF_PERM | ATF_COM;
 	else if (neigh->nud_state&NUD_VALID)
-		flags = ATF_COM;
-	return flags;
+		return ATF_COM;
+	else
+		return 0;
 }
 
 /*
@@ -1126,6 +1142,23 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 	return err;
 }
 
+int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;
+
+	if (neigh) {
+		if (neigh->nud_state & ~NUD_NOARP)
+			err = neigh_update(neigh, NULL, NUD_FAILED,
+					   NEIGH_UPDATE_F_OVERRIDE|
+					   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(arp_invalidate);
+
 static int arp_req_delete_public(struct net *net, struct arpreq *r,
				 struct net_device *dev)
 {
@@ -1142,37 +1175,24 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_delete(struct net *net, struct arpreq *r,
-			  struct net_device * dev)
+			  struct net_device *dev)
 {
-	int err;
 	__be32 ip;
-	struct neighbour *neigh;
 
 	if (r->arp_flags & ATF_PUBL)
 		return arp_req_delete_public(net, r, dev);
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
 	}
-	err = -ENXIO;
-	neigh = neigh_lookup(&arp_tbl, &ip, dev);
-	if (neigh) {
-		if (neigh->nud_state&~NUD_NOARP)
-			err = neigh_update(neigh, NULL, NUD_FAILED,
-					   NEIGH_UPDATE_F_OVERRIDE|
-					   NEIGH_UPDATE_F_ADMIN);
-		neigh_release(neigh);
-	}
-	return err;
+	return arp_invalidate(dev, ip);
 }
 
 /*
@@ -1186,24 +1206,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	struct net_device *dev = NULL;
 
 	switch (cmd) {
 	case SIOCDARP:
 	case SIOCSARP:
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 	case SIOCGARP:
 		err = copy_from_user(&r, arg, sizeof(struct arpreq));
 		if (err)
 			return -EFAULT;
 		break;
 	default:
 		return -EINVAL;
 	}
 
 	if (r.arp_pa.sa_family != AF_INET)
 		return -EPFNOSUPPORT;
 
 	if (!(r.arp_flags & ATF_PUBL) &&
-	    (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
 		return -EINVAL;
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1231,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (dev == NULL)
 			goto out;
 
 		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1234,16 +1255,17 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 	case SIOCGARP:
 		err = arp_req_get(&r, dev);
-		if (!err && copy_to_user(arg, &r, sizeof(r)))
-			err = -EFAULT;
 		break;
 	}
out:
 	rtnl_unlock();
+	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+		err = -EFAULT;
 	return err;
 }
 
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
			    void *ptr)
 {
 	struct net_device *dev = ptr;
 
@@ -1311,12 +1333,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	for (n = 0, s = buf; n < 6; n++) {
 		c = (a->ax25_call[n] >> 1) & 0x7F;
 
-		if (c != ' ') *s++ = c;
+		if (c != ' ')
+			*s++ = c;
 	}
 
 	*s++ = '-';
-
-	if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+	n = (a->ax25_call[6] >> 1) & 0x0F;
+	if (n > 9) {
 		*s++ = '1';
 		n -= 10;
 	}
@@ -1325,10 +1348,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	*s++ = '\0';
 
 	if (*buf == '\0' || *buf == '-')
 		return "*";
 
 	return buf;
-
 }
 #endif /* CONFIG_AX25 */
 
@@ -1408,10 +1430,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
 /* ------------------------------------------------------------------------ */
 
 static const struct seq_operations arp_seq_ops = {
 	.start = arp_seq_start,
 	.next = neigh_seq_next,
 	.stop = neigh_seq_stop,
 	.show = arp_seq_show,
 };
 
 static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41d..2b3c23c287cd 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
  *
  * The CIPSO draft specification can be found in the kernel's Documentation
  * directory as well as the following URL:
- *   http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
  * The FIPS-188 specification can be found at the following URL:
  *   http://www.itl.nist.gov/fipspubs/fip188.htm
  *
@@ -112,7 +112,7 @@ int cipso_v4_rbm_strictvalid = 1;
 /* The maximum number of category ranges permitted in the ranged category tag
  * (tag #5).  You may note that the IETF draft states that the maximum number
  * of category ranges is 7, but if the low end of the last category range is
- * zero then it is possibile to fit 8 category ranges because the zero should
+ * zero then it is possible to fit 8 category ranges because the zero should
  * be omitted. */
 #define CIPSO_V4_TAG_RNG_CAT_MAX	8
 
@@ -438,7 +438,7 @@ cache_add_failure:
  *
  * Description:
  * Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi.  The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().
 * Returns a pointer to the DOI definition on success and NULL on failure.
 */
 static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -1293,7 +1293,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
 		return ret_val;
 
 	/* This will send packets using the "optimized" format when
-	 * possibile as specified in section 3.4.2.6 of the
+	 * possible as specified in section 3.4.2.6 of the
	 * CIPSO draft. */
 	if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
 		tag_len = 14;
@@ -1752,7 +1752,7 @@ validate_return:
 }
 
 /**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
 * @skb: the packet
 * @error: the error code
 * @gateway: CIPSO gateway flag
@@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
 	return CIPSO_V4_HDR_LEN + ret_val;
 }
 
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_options_rcu, rcu));
+}
+
 /**
 * cipso_v4_sock_setattr - Add a CIPSO option to a socket
 * @sk: the socket
@@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *old, *opt = NULL;
 	struct inet_sock *sk_inet;
 	struct inet_connection_sock *sk_conn;
 
@@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk,
 		ret_val = -ENOMEM;
 		goto socket_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
-	opt->cipso = sizeof(struct iphdr);
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);
 	kfree(buf);
 	buf = NULL;
 
 	sk_inet = inet_sk(sk);
+
+	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
 	if (sk_inet->is_icsk) {
 		sk_conn = inet_csk(sk);
-		if (sk_inet->opt)
-			sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
-		sk_conn->icsk_ext_hdr_len += opt->optlen;
+		if (old)
+			sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
 		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
 	}
-	opt = xchg(&sk_inet->opt, opt);
-	kfree(opt);
+	rcu_assign_pointer(sk_inet->inet_opt, opt);
+	if (old)
+		call_rcu(&old->rcu, opt_kfree_rcu);
 
 	return 0;
 
@@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *opt = NULL;
 	struct inet_request_sock *req_inet;
 
 	/* We allocate the maximum CIPSO option size here so we are probably
@@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req,
 		ret_val = -ENOMEM;
 		goto req_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
1993 opt->cipso = sizeof(struct iphdr); 2001 opt->opt.cipso = sizeof(struct iphdr);
1994 kfree(buf); 2002 kfree(buf);
1995 buf = NULL; 2003 buf = NULL;
1996 2004
1997 req_inet = inet_rsk(req); 2005 req_inet = inet_rsk(req);
1998 opt = xchg(&req_inet->opt, opt); 2006 opt = xchg(&req_inet->opt, opt);
1999 kfree(opt); 2007 if (opt)
2008 call_rcu(&opt->rcu, opt_kfree_rcu);
2000 2009
2001 return 0; 2010 return 0;
2002 2011
@@ -2016,34 +2025,34 @@ req_setattr_failure:
2016 * values on failure. 2025 * values on failure.
2017 * 2026 *
2018 */ 2027 */
2019static int cipso_v4_delopt(struct ip_options **opt_ptr) 2028static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
2020{ 2029{
2021 int hdr_delta = 0; 2030 int hdr_delta = 0;
2022 struct ip_options *opt = *opt_ptr; 2031 struct ip_options_rcu *opt = *opt_ptr;
2023 2032
2024 if (opt->srr || opt->rr || opt->ts || opt->router_alert) { 2033 if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
2025 u8 cipso_len; 2034 u8 cipso_len;
2026 u8 cipso_off; 2035 u8 cipso_off;
2027 unsigned char *cipso_ptr; 2036 unsigned char *cipso_ptr;
2028 int iter; 2037 int iter;
2029 int optlen_new; 2038 int optlen_new;
2030 2039
2031 cipso_off = opt->cipso - sizeof(struct iphdr); 2040 cipso_off = opt->opt.cipso - sizeof(struct iphdr);
2032 cipso_ptr = &opt->__data[cipso_off]; 2041 cipso_ptr = &opt->opt.__data[cipso_off];
2033 cipso_len = cipso_ptr[1]; 2042 cipso_len = cipso_ptr[1];
2034 2043
2035 if (opt->srr > opt->cipso) 2044 if (opt->opt.srr > opt->opt.cipso)
2036 opt->srr -= cipso_len; 2045 opt->opt.srr -= cipso_len;
2037 if (opt->rr > opt->cipso) 2046 if (opt->opt.rr > opt->opt.cipso)
2038 opt->rr -= cipso_len; 2047 opt->opt.rr -= cipso_len;
2039 if (opt->ts > opt->cipso) 2048 if (opt->opt.ts > opt->opt.cipso)
2040 opt->ts -= cipso_len; 2049 opt->opt.ts -= cipso_len;
2041 if (opt->router_alert > opt->cipso) 2050 if (opt->opt.router_alert > opt->opt.cipso)
2042 opt->router_alert -= cipso_len; 2051 opt->opt.router_alert -= cipso_len;
2043 opt->cipso = 0; 2052 opt->opt.cipso = 0;
2044 2053
2045 memmove(cipso_ptr, cipso_ptr + cipso_len, 2054 memmove(cipso_ptr, cipso_ptr + cipso_len,
2046 opt->optlen - cipso_off - cipso_len); 2055 opt->opt.optlen - cipso_off - cipso_len);
2047 2056
2048 /* determining the new total option length is tricky because of 2057 /* determining the new total option length is tricky because of
2049 * the padding necessary, the only thing i can think to do at 2058 * the padding necessary, the only thing i can think to do at
@@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
2052 * from there we can determine the new total option length */ 2061 * from there we can determine the new total option length */
2053 iter = 0; 2062 iter = 0;
2054 optlen_new = 0; 2063 optlen_new = 0;
2055 while (iter < opt->optlen) 2064 while (iter < opt->opt.optlen)
2056 if (opt->__data[iter] != IPOPT_NOP) { 2065 if (opt->opt.__data[iter] != IPOPT_NOP) {
2057 iter += opt->__data[iter + 1]; 2066 iter += opt->opt.__data[iter + 1];
2058 optlen_new = iter; 2067 optlen_new = iter;
2059 } else 2068 } else
2060 iter++; 2069 iter++;
2061 hdr_delta = opt->optlen; 2070 hdr_delta = opt->opt.optlen;
2062 opt->optlen = (optlen_new + 3) & ~3; 2071 opt->opt.optlen = (optlen_new + 3) & ~3;
2063 hdr_delta -= opt->optlen; 2072 hdr_delta -= opt->opt.optlen;
2064 } else { 2073 } else {
2065 /* only the cipso option was present on the socket so we can 2074 /* only the cipso option was present on the socket so we can
2066 * remove the entire option struct */ 2075 * remove the entire option struct */
2067 *opt_ptr = NULL; 2076 *opt_ptr = NULL;
2068 hdr_delta = opt->optlen; 2077 hdr_delta = opt->opt.optlen;
2069 kfree(opt); 2078 call_rcu(&opt->rcu, opt_kfree_rcu);
2070 } 2079 }
2071 2080
2072 return hdr_delta; 2081 return hdr_delta;
@@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
2083void cipso_v4_sock_delattr(struct sock *sk) 2092void cipso_v4_sock_delattr(struct sock *sk)
2084{ 2093{
2085 int hdr_delta; 2094 int hdr_delta;
2086 struct ip_options *opt; 2095 struct ip_options_rcu *opt;
2087 struct inet_sock *sk_inet; 2096 struct inet_sock *sk_inet;
2088 2097
2089 sk_inet = inet_sk(sk); 2098 sk_inet = inet_sk(sk);
2090 opt = sk_inet->opt; 2099 opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
2091 if (opt == NULL || opt->cipso == 0) 2100 if (opt == NULL || opt->opt.cipso == 0)
2092 return; 2101 return;
2093 2102
2094 hdr_delta = cipso_v4_delopt(&sk_inet->opt); 2103 hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
2095 if (sk_inet->is_icsk && hdr_delta > 0) { 2104 if (sk_inet->is_icsk && hdr_delta > 0) {
2096 struct inet_connection_sock *sk_conn = inet_csk(sk); 2105 struct inet_connection_sock *sk_conn = inet_csk(sk);
2097 sk_conn->icsk_ext_hdr_len -= hdr_delta; 2106 sk_conn->icsk_ext_hdr_len -= hdr_delta;
@@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk)
2109 */ 2118 */
2110void cipso_v4_req_delattr(struct request_sock *req) 2119void cipso_v4_req_delattr(struct request_sock *req)
2111{ 2120{
2112 struct ip_options *opt; 2121 struct ip_options_rcu *opt;
2113 struct inet_request_sock *req_inet; 2122 struct inet_request_sock *req_inet;
2114 2123
2115 req_inet = inet_rsk(req); 2124 req_inet = inet_rsk(req);
2116 opt = req_inet->opt; 2125 opt = req_inet->opt;
2117 if (opt == NULL || opt->cipso == 0) 2126 if (opt == NULL || opt->opt.cipso == 0)
2118 return; 2127 return;
2119 2128
2120 cipso_v4_delopt(&req_inet->opt); 2129 cipso_v4_delopt(&req_inet->opt);
@@ -2184,14 +2193,18 @@ getattr_return:
2184 */ 2193 */
2185int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) 2194int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
2186{ 2195{
2187 struct ip_options *opt; 2196 struct ip_options_rcu *opt;
2197 int res = -ENOMSG;
2188 2198
2189 opt = inet_sk(sk)->opt; 2199 rcu_read_lock();
2190 if (opt == NULL || opt->cipso == 0) 2200 opt = rcu_dereference(inet_sk(sk)->inet_opt);
2191 return -ENOMSG; 2201 if (opt && opt->opt.cipso)
2192 2202 res = cipso_v4_getattr(opt->opt.__data +
2193 return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), 2203 opt->opt.cipso -
2194 secattr); 2204 sizeof(struct iphdr),
2205 secattr);
2206 rcu_read_unlock();
2207 return res;
2195} 2208}
2196 2209
2197/** 2210/**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45c..424fafbc8cb0 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{ 24{
25 struct inet_sock *inet = inet_sk(sk); 25 struct inet_sock *inet = inet_sk(sk);
26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; 26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
27 struct flowi4 *fl4;
27 struct rtable *rt; 28 struct rtable *rt;
28 __be32 saddr; 29 __be32 saddr;
29 int oif; 30 int oif;
@@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
38 39
39 sk_dst_reset(sk); 40 sk_dst_reset(sk);
40 41
42 lock_sock(sk);
43
41 oif = sk->sk_bound_dev_if; 44 oif = sk->sk_bound_dev_if;
42 saddr = inet->inet_saddr; 45 saddr = inet->inet_saddr;
43 if (ipv4_is_multicast(usin->sin_addr.s_addr)) { 46 if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -46,33 +49,39 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
46 if (!saddr) 49 if (!saddr)
47 saddr = inet->mc_addr; 50 saddr = inet->mc_addr;
48 } 51 }
49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, 52 fl4 = &inet->cork.fl.u.ip4;
50 RT_CONN_FLAGS(sk), oif, 53 rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
51 sk->sk_protocol, 54 RT_CONN_FLAGS(sk), oif,
52 inet->inet_sport, usin->sin_port, sk, 1); 55 sk->sk_protocol,
53 if (err) { 56 inet->inet_sport, usin->sin_port, sk, true);
57 if (IS_ERR(rt)) {
58 err = PTR_ERR(rt);
54 if (err == -ENETUNREACH) 59 if (err == -ENETUNREACH)
55 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 60 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
56 return err; 61 goto out;
57 } 62 }
58 63
59 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { 64 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
60 ip_rt_put(rt); 65 ip_rt_put(rt);
61 return -EACCES; 66 err = -EACCES;
67 goto out;
62 } 68 }
63 if (!inet->inet_saddr) 69 if (!inet->inet_saddr)
64 inet->inet_saddr = rt->rt_src; /* Update source address */ 70 inet->inet_saddr = fl4->saddr; /* Update source address */
65 if (!inet->inet_rcv_saddr) { 71 if (!inet->inet_rcv_saddr) {
66 inet->inet_rcv_saddr = rt->rt_src; 72 inet->inet_rcv_saddr = fl4->saddr;
67 if (sk->sk_prot->rehash) 73 if (sk->sk_prot->rehash)
68 sk->sk_prot->rehash(sk); 74 sk->sk_prot->rehash(sk);
69 } 75 }
70 inet->inet_daddr = rt->rt_dst; 76 inet->inet_daddr = fl4->daddr;
71 inet->inet_dport = usin->sin_port; 77 inet->inet_dport = usin->sin_port;
72 sk->sk_state = TCP_ESTABLISHED; 78 sk->sk_state = TCP_ESTABLISHED;
73 inet->inet_id = jiffies; 79 inet->inet_id = jiffies;
74 80
75 sk_dst_set(sk, &rt->dst); 81 sk_dst_set(sk, &rt->dst);
76 return(0); 82 err = 0;
83out:
84 release_sock(sk);
85 return err;
77} 86}
78EXPORT_SYMBOL(ip4_datagram_connect); 87EXPORT_SYMBOL(ip4_datagram_connect);
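
ip4_datagram_connect() now receives the route directly from ip_route_connect() as an ERR_PTR-encoded pointer rather than through an output parameter, so errors are detected with IS_ERR()/PTR_ERR() before the single release_sock() exit path. The sketch below reproduces that pointer-encoding convention in userspace; route_connect_demo() and struct rtable_demo are invented, and the macros follow the same MAX_ERRNO convention as include/linux/err.h.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)	{ return IS_ERR_VALUE((unsigned long)ptr); }

struct rtable_demo { unsigned int flags; };

/* Pretend route lookup: one reachable destination, errors for the rest. */
static struct rtable_demo *route_connect_demo(unsigned int daddr)
{
	static struct rtable_demo rt = { .flags = 0x1 };

	if (daddr == 0x7f000001)	/* 127.0.0.1 */
		return &rt;
	return ERR_PTR(-ENETUNREACH);	/* error code hidden inside the pointer */
}

int main(void)
{
	struct rtable_demo *rt = route_connect_demo(0x0a000001);

	if (IS_ERR(rt))
		printf("lookup failed: err=%ld\n", PTR_ERR(rt));
	else
		printf("lookup ok, flags=%#x\n", rt->flags);
	return 0;
}
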
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..0d4a184af16f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
51#include <linux/inetdevice.h> 51#include <linux/inetdevice.h>
52#include <linux/igmp.h> 52#include <linux/igmp.h>
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/hash.h>
54#ifdef CONFIG_SYSCTL 55#ifdef CONFIG_SYSCTL
55#include <linux/sysctl.h> 56#include <linux/sysctl.h>
56#endif 57#endif
@@ -63,6 +64,8 @@
63#include <net/rtnetlink.h> 64#include <net/rtnetlink.h>
64#include <net/net_namespace.h> 65#include <net/net_namespace.h>
65 66
67#include "fib_lookup.h"
68
66static struct ipv4_devconf ipv4_devconf = { 69static struct ipv4_devconf ipv4_devconf = {
67 .data = { 70 .data = {
68 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, 71 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
92 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, 95 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
93}; 96};
94 97
98/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
99 * value. So if you change this define, make appropriate changes to
100 * inet_addr_hash as well.
101 */
102#define IN4_ADDR_HSIZE 256
103static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
104static DEFINE_SPINLOCK(inet_addr_hash_lock);
105
106static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
107{
108 u32 val = (__force u32) addr ^ hash_ptr(net, 8);
109
110 return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
111 (IN4_ADDR_HSIZE - 1));
112}
113
114static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
115{
116 unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
117
118 spin_lock(&inet_addr_hash_lock);
119 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
120 spin_unlock(&inet_addr_hash_lock);
121}
122
123static void inet_hash_remove(struct in_ifaddr *ifa)
124{
125 spin_lock(&inet_addr_hash_lock);
126 hlist_del_init_rcu(&ifa->hash);
127 spin_unlock(&inet_addr_hash_lock);
128}
129
130/**
131 * __ip_dev_find - find the first device with a given source address.
132 * @net: the net namespace
133 * @addr: the source address
134 * @devref: if true, take a reference on the found device
135 *
136 * If a caller uses devref=false, it should be protected by RCU, or RTNL
137 */
138struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
139{
140 unsigned int hash = inet_addr_hash(net, addr);
141 struct net_device *result = NULL;
142 struct in_ifaddr *ifa;
143 struct hlist_node *node;
144
145 rcu_read_lock();
146 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
147 struct net_device *dev = ifa->ifa_dev->dev;
148
149 if (!net_eq(dev_net(dev), net))
150 continue;
151 if (ifa->ifa_local == addr) {
152 result = dev;
153 break;
154 }
155 }
156 if (!result) {
157 struct flowi4 fl4 = { .daddr = addr };
158 struct fib_result res = { 0 };
159 struct fib_table *local;
160
161 /* Fallback to FIB local table so that communication
162 * over loopback subnets work.
163 */
164 local = fib_get_table(net, RT_TABLE_LOCAL);
165 if (local &&
166 !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
167 res.type == RTN_LOCAL)
168 result = FIB_RES_DEV(res);
169 }
170 if (result && devref)
171 dev_hold(result);
172 rcu_read_unlock();
173 return result;
174}
175EXPORT_SYMBOL(__ip_dev_find);
176
95static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); 177static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
96 178
97static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); 179static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
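
The new inet_addr_lst[] table above lets __ip_dev_find() resolve a local address to its device with a single hash-bucket walk instead of a full FIB lookup. Below is a small userspace sketch of the same idea: fold the four address bytes into one of 256 buckets and walk the chain. The namespace mix-in (hash_ptr(net, 8)), the RCU protection and the FIB local-table fallback are omitted, and struct demo_ifa plus the helper names are invented for the demo.

#include <stdio.h>
#include <stdint.h>

#define IN4_ADDR_HSIZE 256

struct demo_ifa {
	uint32_t local;			/* address, host byte order for simplicity */
	const char *dev;		/* owning device name */
	struct demo_ifa *next;		/* hash-chain link */
};

static struct demo_ifa *addr_hash[IN4_ADDR_HSIZE];

static unsigned int demo_addr_hash(uint32_t addr)
{
	uint32_t val = addr;

	/* fold all four bytes into one bucket index, as inet_addr_hash() does */
	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & (IN4_ADDR_HSIZE - 1);
}

static void demo_hash_insert(struct demo_ifa *ifa)
{
	unsigned int h = demo_addr_hash(ifa->local);

	ifa->next = addr_hash[h];
	addr_hash[h] = ifa;
}

static const char *demo_dev_find(uint32_t addr)
{
	struct demo_ifa *ifa;

	for (ifa = addr_hash[demo_addr_hash(addr)]; ifa; ifa = ifa->next)
		if (ifa->local == addr)
			return ifa->dev;
	return NULL;	/* __ip_dev_find() would now fall back to the FIB local table */
}

int main(void)
{
	struct demo_ifa lo  = { .local = 0x7f000001, .dev = "lo" };
	struct demo_ifa eth = { .local = 0xc0a80001, .dev = "eth0" };

	demo_hash_insert(&lo);
	demo_hash_insert(&eth);
	printf("192.168.0.1 -> %s\n", demo_dev_find(0xc0a80001));
	return 0;
}
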
@@ -209,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev)
209 inet_free_ifa(ifa); 291 inet_free_ifa(ifa);
210 } 292 }
211 293
212 dev->ip_ptr = NULL; 294 rcu_assign_pointer(dev->ip_ptr, NULL);
213 295
214 devinet_sysctl_unregister(in_dev); 296 devinet_sysctl_unregister(in_dev);
215 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 297 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
265 } 347 }
266 348
267 if (!do_promote) { 349 if (!do_promote) {
350 inet_hash_remove(ifa);
268 *ifap1 = ifa->ifa_next; 351 *ifap1 = ifa->ifa_next;
269 352
270 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); 353 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
278 } 361 }
279 } 362 }
280 363
364 /* On promotion all secondaries from subnet are changing
365 * the primary IP, we must remove all their routes silently
366 * and later to add them back with new prefsrc. Do this
367 * while all addresses are on the device list.
368 */
369 for (ifa = promote; ifa; ifa = ifa->ifa_next) {
370 if (ifa1->ifa_mask == ifa->ifa_mask &&
371 inet_ifa_match(ifa1->ifa_address, ifa))
372 fib_del_ifaddr(ifa, ifa1);
373 }
374
281 /* 2. Unlink it */ 375 /* 2. Unlink it */
282 376
283 *ifap = ifa1->ifa_next; 377 *ifap = ifa1->ifa_next;
378 inet_hash_remove(ifa1);
284 379
285 /* 3. Announce address deletion */ 380 /* 3. Announce address deletion */
286 381
@@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
296 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); 391 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
297 392
298 if (promote) { 393 if (promote) {
394 struct in_ifaddr *next_sec = promote->ifa_next;
299 395
300 if (prev_prom) { 396 if (prev_prom) {
301 prev_prom->ifa_next = promote->ifa_next; 397 prev_prom->ifa_next = promote->ifa_next;
@@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
307 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); 403 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
308 blocking_notifier_call_chain(&inetaddr_chain, 404 blocking_notifier_call_chain(&inetaddr_chain,
309 NETDEV_UP, promote); 405 NETDEV_UP, promote);
310 for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { 406 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
311 if (ifa1->ifa_mask != ifa->ifa_mask || 407 if (ifa1->ifa_mask != ifa->ifa_mask ||
312 !inet_ifa_match(ifa1->ifa_address, ifa)) 408 !inet_ifa_match(ifa1->ifa_address, ifa))
313 continue; 409 continue;
@@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
368 ifa->ifa_next = *ifap; 464 ifa->ifa_next = *ifap;
369 *ifap = ifa; 465 *ifap = ifa;
370 466
467 inet_hash_insert(dev_net(in_dev->dev), ifa);
468
371 /* Send message first, then call notifier. 469 /* Send message first, then call notifier.
372 Notifier will trigger FIB update, so that 470 Notifier will trigger FIB update, so that
373 listeners of netlink will know about new ifaddr */ 471 listeners of netlink will know about new ifaddr */
@@ -403,6 +501,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
403 return inet_insert_ifa(ifa); 501 return inet_insert_ifa(ifa);
404} 502}
405 503
504/* Caller must hold RCU or RTNL :
505 * We dont take a reference on found in_device
506 */
406struct in_device *inetdev_by_index(struct net *net, int ifindex) 507struct in_device *inetdev_by_index(struct net *net, int ifindex)
407{ 508{
408 struct net_device *dev; 509 struct net_device *dev;
@@ -411,7 +512,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
411 rcu_read_lock(); 512 rcu_read_lock();
412 dev = dev_get_by_index_rcu(net, ifindex); 513 dev = dev_get_by_index_rcu(net, ifindex);
413 if (dev) 514 if (dev)
414 in_dev = in_dev_get(dev); 515 in_dev = rcu_dereference_rtnl(dev->ip_ptr);
415 rcu_read_unlock(); 516 rcu_read_unlock();
416 return in_dev; 517 return in_dev;
417} 518}
@@ -453,8 +554,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
453 goto errout; 554 goto errout;
454 } 555 }
455 556
456 __in_dev_put(in_dev);
457
458 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; 557 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
459 ifap = &ifa->ifa_next) { 558 ifap = &ifa->ifa_next) {
460 if (tb[IFA_LOCAL] && 559 if (tb[IFA_LOCAL] &&
@@ -520,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
520 if (tb[IFA_ADDRESS] == NULL) 619 if (tb[IFA_ADDRESS] == NULL)
521 tb[IFA_ADDRESS] = tb[IFA_LOCAL]; 620 tb[IFA_ADDRESS] = tb[IFA_LOCAL];
522 621
622 INIT_HLIST_NODE(&ifa->hash);
523 ifa->ifa_prefixlen = ifm->ifa_prefixlen; 623 ifa->ifa_prefixlen = ifm->ifa_prefixlen;
524 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); 624 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
525 ifa->ifa_flags = ifm->ifa_flags; 625 ifa->ifa_flags = ifm->ifa_flags;
@@ -669,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
669 ifap = &ifa->ifa_next) { 769 ifap = &ifa->ifa_next) {
670 if (!strcmp(ifr.ifr_name, ifa->ifa_label) && 770 if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
671 sin_orig.sin_addr.s_addr == 771 sin_orig.sin_addr.s_addr ==
672 ifa->ifa_address) { 772 ifa->ifa_local) {
673 break; /* found */ 773 break; /* found */
674 } 774 }
675 } 775 }
@@ -727,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
727 if (!ifa) { 827 if (!ifa) {
728 ret = -ENOBUFS; 828 ret = -ENOBUFS;
729 ifa = inet_alloc_ifa(); 829 ifa = inet_alloc_ifa();
830 INIT_HLIST_NODE(&ifa->hash);
730 if (!ifa) 831 if (!ifa)
731 break; 832 break;
732 if (colon) 833 if (colon)
@@ -1029,6 +1130,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
1029 return mtu >= 68; 1130 return mtu >= 68;
1030} 1131}
1031 1132
1133static void inetdev_send_gratuitous_arp(struct net_device *dev,
1134 struct in_device *in_dev)
1135
1136{
1137 struct in_ifaddr *ifa = in_dev->ifa_list;
1138
1139 if (!ifa)
1140 return;
1141
1142 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1143 ifa->ifa_local, dev,
1144 ifa->ifa_local, NULL,
1145 dev->dev_addr, NULL);
1146}
1147
1032/* Called only under RTNL semaphore */ 1148/* Called only under RTNL semaphore */
1033 1149
1034static int inetdev_event(struct notifier_block *this, unsigned long event, 1150static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1059,7 +1175,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1059 switch (event) { 1175 switch (event) {
1060 case NETDEV_REGISTER: 1176 case NETDEV_REGISTER:
1061 printk(KERN_DEBUG "inetdev_event: bug\n"); 1177 printk(KERN_DEBUG "inetdev_event: bug\n");
1062 dev->ip_ptr = NULL; 1178 rcu_assign_pointer(dev->ip_ptr, NULL);
1063 break; 1179 break;
1064 case NETDEV_UP: 1180 case NETDEV_UP:
1065 if (!inetdev_valid_mtu(dev->mtu)) 1181 if (!inetdev_valid_mtu(dev->mtu))
@@ -1068,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1068 struct in_ifaddr *ifa = inet_alloc_ifa(); 1184 struct in_ifaddr *ifa = inet_alloc_ifa();
1069 1185
1070 if (ifa) { 1186 if (ifa) {
1187 INIT_HLIST_NODE(&ifa->hash);
1071 ifa->ifa_local = 1188 ifa->ifa_local =
1072 ifa->ifa_address = htonl(INADDR_LOOPBACK); 1189 ifa->ifa_address = htonl(INADDR_LOOPBACK);
1073 ifa->ifa_prefixlen = 8; 1190 ifa->ifa_prefixlen = 8;
@@ -1081,18 +1198,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1081 } 1198 }
1082 ip_mc_up(in_dev); 1199 ip_mc_up(in_dev);
1083 /* fall through */ 1200 /* fall through */
1084 case NETDEV_NOTIFY_PEERS:
1085 case NETDEV_CHANGEADDR: 1201 case NETDEV_CHANGEADDR:
1202 if (!IN_DEV_ARP_NOTIFY(in_dev))
1203 break;
1204 /* fall through */
1205 case NETDEV_NOTIFY_PEERS:
1086 /* Send gratuitous ARP to notify of link change */ 1206 /* Send gratuitous ARP to notify of link change */
1087 if (IN_DEV_ARP_NOTIFY(in_dev)) { 1207 inetdev_send_gratuitous_arp(dev, in_dev);
1088 struct in_ifaddr *ifa = in_dev->ifa_list;
1089
1090 if (ifa)
1091 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1092 ifa->ifa_address, dev,
1093 ifa->ifa_address, NULL,
1094 dev->dev_addr, NULL);
1095 }
1096 break; 1208 break;
1097 case NETDEV_DOWN: 1209 case NETDEV_DOWN:
1098 ip_mc_down(in_dev); 1210 ip_mc_down(in_dev);
@@ -1255,6 +1367,87 @@ errout:
1255 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1367 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
1256} 1368}
1257 1369
1370static size_t inet_get_link_af_size(const struct net_device *dev)
1371{
1372 struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
1373
1374 if (!in_dev)
1375 return 0;
1376
1377 return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
1378}
1379
1380static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
1381{
1382 struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
1383 struct nlattr *nla;
1384 int i;
1385
1386 if (!in_dev)
1387 return -ENODATA;
1388
1389 nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
1390 if (nla == NULL)
1391 return -EMSGSIZE;
1392
1393 for (i = 0; i < IPV4_DEVCONF_MAX; i++)
1394 ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
1395
1396 return 0;
1397}
1398
1399static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
1400 [IFLA_INET_CONF] = { .type = NLA_NESTED },
1401};
1402
1403static int inet_validate_link_af(const struct net_device *dev,
1404 const struct nlattr *nla)
1405{
1406 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1407 int err, rem;
1408
1409 if (dev && !__in_dev_get_rtnl(dev))
1410 return -EAFNOSUPPORT;
1411
1412 err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
1413 if (err < 0)
1414 return err;
1415
1416 if (tb[IFLA_INET_CONF]) {
1417 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
1418 int cfgid = nla_type(a);
1419
1420 if (nla_len(a) < 4)
1421 return -EINVAL;
1422
1423 if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
1424 return -EINVAL;
1425 }
1426 }
1427
1428 return 0;
1429}
1430
1431static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1432{
1433 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1434 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1435 int rem;
1436
1437 if (!in_dev)
1438 return -EAFNOSUPPORT;
1439
1440 if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
1441 BUG();
1442
1443 if (tb[IFLA_INET_CONF]) {
1444 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
1445 ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
1446 }
1447
1448 return 0;
1449}
1450
1258#ifdef CONFIG_SYSCTL 1451#ifdef CONFIG_SYSCTL
1259 1452
1260static void devinet_copy_dflt_conf(struct net *net, int i) 1453static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1348,9 +1541,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1348 return ret; 1541 return ret;
1349} 1542}
1350 1543
1351int ipv4_doint_and_flush(ctl_table *ctl, int write, 1544static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1352 void __user *buffer, 1545 void __user *buffer,
1353 size_t *lenp, loff_t *ppos) 1546 size_t *lenp, loff_t *ppos)
1354{ 1547{
1355 int *valp = ctl->data; 1548 int *valp = ctl->data;
1356 int val = *valp; 1549 int val = *valp;
@@ -1487,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1487 return; 1680 return;
1488 1681
1489 cnf->sysctl = NULL; 1682 cnf->sysctl = NULL;
1490 unregister_sysctl_table(t->sysctl_header); 1683 unregister_net_sysctl_table(t->sysctl_header);
1491 kfree(t->dev_name); 1684 kfree(t->dev_name);
1492 kfree(t); 1685 kfree(t);
1493} 1686}
@@ -1618,13 +1811,28 @@ static __net_initdata struct pernet_operations devinet_ops = {
1618 .exit = devinet_exit_net, 1811 .exit = devinet_exit_net,
1619}; 1812};
1620 1813
1814static struct rtnl_af_ops inet_af_ops = {
1815 .family = AF_INET,
1816 .fill_link_af = inet_fill_link_af,
1817 .get_link_af_size = inet_get_link_af_size,
1818 .validate_link_af = inet_validate_link_af,
1819 .set_link_af = inet_set_link_af,
1820};
1821
1621void __init devinet_init(void) 1822void __init devinet_init(void)
1622{ 1823{
1824 int i;
1825
1826 for (i = 0; i < IN4_ADDR_HSIZE; i++)
1827 INIT_HLIST_HEAD(&inet_addr_lst[i]);
1828
1623 register_pernet_subsys(&devinet_ops); 1829 register_pernet_subsys(&devinet_ops);
1624 1830
1625 register_gifconf(PF_INET, inet_gifconf); 1831 register_gifconf(PF_INET, inet_gifconf);
1626 register_netdevice_notifier(&ip_netdev_notifier); 1832 register_netdevice_notifier(&ip_netdev_notifier);
1627 1833
1834 rtnl_af_register(&inet_af_ops);
1835
1628 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); 1836 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
1629 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); 1837 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
1630 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); 1838 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
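
The rtnl_af_ops hooks registered above dump and apply per-device IPv4 settings as a flat array of u32 values inside IFLA_INET_CONF, indexed by their 1-based devconf id. The userspace sketch below shows just that packing and bounds check; DEMO_DEVCONF_MAX and the helper names are stand-ins, and all netlink attribute handling is left out.

#include <stdio.h>
#include <stdint.h>

#define DEMO_DEVCONF_MAX 3		/* stand-in for IPV4_DEVCONF_MAX */

static uint32_t devconf[DEMO_DEVCONF_MAX];	/* cnf.data[] equivalent */

/* inet_validate_link_af() rejects ids outside 1..IPV4_DEVCONF_MAX;
 * inet_set_link_af() then stores by the 1-based attribute type. */
static int demo_devconf_set(int cfgid, uint32_t val)
{
	if (cfgid <= 0 || cfgid > DEMO_DEVCONF_MAX)
		return -1;
	devconf[cfgid - 1] = val;
	return 0;
}

/* inet_fill_link_af() equivalent: dump every slot into one flat buffer. */
static void demo_devconf_fill(uint32_t *out)
{
	int i;

	for (i = 0; i < DEMO_DEVCONF_MAX; i++)
		out[i] = devconf[i];
}

int main(void)
{
	uint32_t dump[DEMO_DEVCONF_MAX];
	int i;

	demo_devconf_set(1, 1);		/* e.g. turn a boolean option on */
	demo_devconf_set(4, 1);		/* out of range: rejected, nothing stored */
	demo_devconf_fill(dump);
	for (i = 0; i < DEMO_DEVCONF_MAX; i++)
		printf("slot %d = %u\n", i + 1, (unsigned)dump[i]);
	return 0;
}
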
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1c3fb0..a5b413416da3 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
23 23
24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) 24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
25 25
26static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
27
26/* 28/*
27 * Allocate an AEAD request structure with extra space for SG and IV. 29 * Allocate an AEAD request structure with extra space for SG and IV.
28 * 30 *
@@ -31,11 +33,14 @@ struct esp_skb_cb {
31 * 33 *
32 * TODO: Use spare space in skb for this where possible. 34 * TODO: Use spare space in skb for this where possible.
33 */ 35 */
34static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) 36static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
35{ 37{
36 unsigned int len; 38 unsigned int len;
37 39
38 len = crypto_aead_ivsize(aead); 40 len = seqhilen;
41
42 len += crypto_aead_ivsize(aead);
43
39 if (len) { 44 if (len) {
40 len += crypto_aead_alignmask(aead) & 45 len += crypto_aead_alignmask(aead) &
41 ~(crypto_tfm_ctx_alignment() - 1); 46 ~(crypto_tfm_ctx_alignment() - 1);
@@ -50,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
50 return kmalloc(len, GFP_ATOMIC); 55 return kmalloc(len, GFP_ATOMIC);
51} 56}
52 57
53static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) 58static inline __be32 *esp_tmp_seqhi(void *tmp)
59{
60 return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
61}
62static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
54{ 63{
55 return crypto_aead_ivsize(aead) ? 64 return crypto_aead_ivsize(aead) ?
56 PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; 65 PTR_ALIGN((u8 *)tmp + seqhilen,
66 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
57} 67}
58 68
59static inline struct aead_givcrypt_request *esp_tmp_givreq( 69static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -117,46 +127,75 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
117 int blksize; 127 int blksize;
118 int clen; 128 int clen;
119 int alen; 129 int alen;
130 int plen;
131 int tfclen;
120 int nfrags; 132 int nfrags;
133 int assoclen;
134 int sglists;
135 int seqhilen;
136 __be32 *seqhi;
121 137
122 /* skb is pure payload to encrypt */ 138 /* skb is pure payload to encrypt */
123 139
124 err = -ENOMEM; 140 err = -ENOMEM;
125 141
126 /* Round to block size */
127 clen = skb->len;
128
129 esp = x->data; 142 esp = x->data;
130 aead = esp->aead; 143 aead = esp->aead;
131 alen = crypto_aead_authsize(aead); 144 alen = crypto_aead_authsize(aead);
132 145
146 tfclen = 0;
147 if (x->tfcpad) {
148 struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
149 u32 padto;
150
151 padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
152 if (skb->len < padto)
153 tfclen = padto - skb->len;
154 }
133 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 155 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
134 clen = ALIGN(clen + 2, blksize); 156 clen = ALIGN(skb->len + 2 + tfclen, blksize);
135 if (esp->padlen) 157 if (esp->padlen)
136 clen = ALIGN(clen, esp->padlen); 158 clen = ALIGN(clen, esp->padlen);
159 plen = clen - skb->len - tfclen;
137 160
138 if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) 161 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
162 if (err < 0)
139 goto error; 163 goto error;
140 nfrags = err; 164 nfrags = err;
141 165
142 tmp = esp_alloc_tmp(aead, nfrags + 1); 166 assoclen = sizeof(*esph);
167 sglists = 1;
168 seqhilen = 0;
169
170 if (x->props.flags & XFRM_STATE_ESN) {
171 sglists += 2;
172 seqhilen += sizeof(__be32);
173 assoclen += seqhilen;
174 }
175
176 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
143 if (!tmp) 177 if (!tmp)
144 goto error; 178 goto error;
145 179
146 iv = esp_tmp_iv(aead, tmp); 180 seqhi = esp_tmp_seqhi(tmp);
181 iv = esp_tmp_iv(aead, tmp, seqhilen);
147 req = esp_tmp_givreq(aead, iv); 182 req = esp_tmp_givreq(aead, iv);
148 asg = esp_givreq_sg(aead, req); 183 asg = esp_givreq_sg(aead, req);
149 sg = asg + 1; 184 sg = asg + sglists;
150 185
151 /* Fill padding... */ 186 /* Fill padding... */
152 tail = skb_tail_pointer(trailer); 187 tail = skb_tail_pointer(trailer);
188 if (tfclen) {
189 memset(tail, 0, tfclen);
190 tail += tfclen;
191 }
153 do { 192 do {
154 int i; 193 int i;
155 for (i=0; i<clen-skb->len - 2; i++) 194 for (i = 0; i < plen - 2; i++)
156 tail[i] = i + 1; 195 tail[i] = i + 1;
157 } while (0); 196 } while (0);
158 tail[clen - skb->len - 2] = (clen - skb->len) - 2; 197 tail[plen - 2] = plen - 2;
159 tail[clen - skb->len - 1] = *skb_mac_header(skb); 198 tail[plen - 1] = *skb_mac_header(skb);
160 pskb_put(skb, trailer, clen - skb->len + alen); 199 pskb_put(skb, trailer, clen - skb->len + alen);
161 200
162 skb_push(skb, -skb_network_offset(skb)); 201 skb_push(skb, -skb_network_offset(skb));
@@ -199,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
199 } 238 }
200 239
201 esph->spi = x->id.spi; 240 esph->spi = x->id.spi;
202 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); 241 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
203 242
204 sg_init_table(sg, nfrags); 243 sg_init_table(sg, nfrags);
205 skb_to_sgvec(skb, sg, 244 skb_to_sgvec(skb, sg,
206 esph->enc_data + crypto_aead_ivsize(aead) - skb->data, 245 esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
207 clen + alen); 246 clen + alen);
208 sg_init_one(asg, esph, sizeof(*esph)); 247
248 if ((x->props.flags & XFRM_STATE_ESN)) {
249 sg_init_table(asg, 3);
250 sg_set_buf(asg, &esph->spi, sizeof(__be32));
251 *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
252 sg_set_buf(asg + 1, seqhi, seqhilen);
253 sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
254 } else
255 sg_init_one(asg, esph, sizeof(*esph));
209 256
210 aead_givcrypt_set_callback(req, 0, esp_output_done, skb); 257 aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
211 aead_givcrypt_set_crypt(req, sg, sg, clen, iv); 258 aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
212 aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); 259 aead_givcrypt_set_assoc(req, asg, assoclen);
213 aead_givcrypt_set_giv(req, esph->enc_data, 260 aead_givcrypt_set_giv(req, esph->enc_data,
214 XFRM_SKB_CB(skb)->seq.output); 261 XFRM_SKB_CB(skb)->seq.output.low);
215 262
216 ESP_SKB_CB(skb)->tmp = tmp; 263 ESP_SKB_CB(skb)->tmp = tmp;
217 err = crypto_aead_givencrypt(req); 264 err = crypto_aead_givencrypt(req);
@@ -229,7 +276,7 @@ error:
229 276
230static int esp_input_done2(struct sk_buff *skb, int err) 277static int esp_input_done2(struct sk_buff *skb, int err)
231{ 278{
232 struct iphdr *iph; 279 const struct iphdr *iph;
233 struct xfrm_state *x = xfrm_input_state(skb); 280 struct xfrm_state *x = xfrm_input_state(skb);
234 struct esp_data *esp = x->data; 281 struct esp_data *esp = x->data;
235 struct crypto_aead *aead = esp->aead; 282 struct crypto_aead *aead = esp->aead;
@@ -330,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
330 struct sk_buff *trailer; 377 struct sk_buff *trailer;
331 int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); 378 int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
332 int nfrags; 379 int nfrags;
380 int assoclen;
381 int sglists;
382 int seqhilen;
383 __be32 *seqhi;
333 void *tmp; 384 void *tmp;
334 u8 *iv; 385 u8 *iv;
335 struct scatterlist *sg; 386 struct scatterlist *sg;
@@ -346,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
346 goto out; 397 goto out;
347 nfrags = err; 398 nfrags = err;
348 399
400 assoclen = sizeof(*esph);
401 sglists = 1;
402 seqhilen = 0;
403
404 if (x->props.flags & XFRM_STATE_ESN) {
405 sglists += 2;
406 seqhilen += sizeof(__be32);
407 assoclen += seqhilen;
408 }
409
349 err = -ENOMEM; 410 err = -ENOMEM;
350 tmp = esp_alloc_tmp(aead, nfrags + 1); 411 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
351 if (!tmp) 412 if (!tmp)
352 goto out; 413 goto out;
353 414
354 ESP_SKB_CB(skb)->tmp = tmp; 415 ESP_SKB_CB(skb)->tmp = tmp;
355 iv = esp_tmp_iv(aead, tmp); 416 seqhi = esp_tmp_seqhi(tmp);
417 iv = esp_tmp_iv(aead, tmp, seqhilen);
356 req = esp_tmp_req(aead, iv); 418 req = esp_tmp_req(aead, iv);
357 asg = esp_req_sg(aead, req); 419 asg = esp_req_sg(aead, req);
358 sg = asg + 1; 420 sg = asg + sglists;
359 421
360 skb->ip_summed = CHECKSUM_NONE; 422 skb->ip_summed = CHECKSUM_NONE;
361 423
@@ -366,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
366 428
367 sg_init_table(sg, nfrags); 429 sg_init_table(sg, nfrags);
368 skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); 430 skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
369 sg_init_one(asg, esph, sizeof(*esph)); 431
432 if ((x->props.flags & XFRM_STATE_ESN)) {
433 sg_init_table(asg, 3);
434 sg_set_buf(asg, &esph->spi, sizeof(__be32));
435 *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
436 sg_set_buf(asg + 1, seqhi, seqhilen);
437 sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
438 } else
439 sg_init_one(asg, esph, sizeof(*esph));
370 440
371 aead_request_set_callback(req, 0, esp_input_done, skb); 441 aead_request_set_callback(req, 0, esp_input_done, skb);
372 aead_request_set_crypt(req, sg, sg, elen, iv); 442 aead_request_set_crypt(req, sg, sg, elen, iv);
373 aead_request_set_assoc(req, asg, sizeof(*esph)); 443 aead_request_set_assoc(req, asg, assoclen);
374 444
375 err = crypto_aead_decrypt(req); 445 err = crypto_aead_decrypt(req);
376 if (err == -EINPROGRESS) 446 if (err == -EINPROGRESS)
@@ -414,7 +484,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
414static void esp4_err(struct sk_buff *skb, u32 info) 484static void esp4_err(struct sk_buff *skb, u32 info)
415{ 485{
416 struct net *net = dev_net(skb->dev); 486 struct net *net = dev_net(skb->dev);
417 struct iphdr *iph = (struct iphdr *)skb->data; 487 const struct iphdr *iph = (const struct iphdr *)skb->data;
418 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 488 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
419 struct xfrm_state *x; 489 struct xfrm_state *x;
420 490
@@ -422,7 +492,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 492 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
423 return; 493 return;
424 494
425 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 495 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
496 esph->spi, IPPROTO_ESP, AF_INET);
426 if (!x) 497 if (!x)
427 return; 498 return;
428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 499 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -484,10 +555,20 @@ static int esp_init_authenc(struct xfrm_state *x)
484 goto error; 555 goto error;
485 556
486 err = -ENAMETOOLONG; 557 err = -ENAMETOOLONG;
487 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", 558
488 x->aalg ? x->aalg->alg_name : "digest_null", 559 if ((x->props.flags & XFRM_STATE_ESN)) {
489 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) 560 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
490 goto error; 561 "authencesn(%s,%s)",
562 x->aalg ? x->aalg->alg_name : "digest_null",
563 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
564 goto error;
565 } else {
566 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
567 "authenc(%s,%s)",
568 x->aalg ? x->aalg->alg_name : "digest_null",
569 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
570 goto error;
571 }
491 572
492 aead = crypto_alloc_aead(authenc_name, 0, 0); 573 aead = crypto_alloc_aead(authenc_name, 0, 0);
493 err = PTR_ERR(aead); 574 err = PTR_ERR(aead);
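
With XFRM_STATE_ESN set, the esp4.c changes above authenticate the 32-bit high sequence number as extra associated data (three scatterlist pieces instead of one) and request an authencesn() transform instead of authenc(). The sketch below just recomputes those sizes and the template name in userspace; DEMO_STATE_ESN and the sample algorithm names are illustrative.

#include <stdio.h>
#include <errno.h>

#define DEMO_STATE_ESN		0x1
#define CRYPTO_MAX_ALG_NAME	64

struct demo_assoc {
	int assoclen;	/* bytes of associated (authenticated-only) data */
	int sglists;	/* scatterlist entries reserved for it */
	int seqhilen;	/* extra bytes holding the high sequence number */
};

static struct demo_assoc demo_esp_assoc(unsigned int flags)
{
	/* 8 bytes = SPI + low 32-bit sequence number, as in sizeof(*esph) */
	struct demo_assoc a = { .assoclen = 8, .sglists = 1, .seqhilen = 0 };

	if (flags & DEMO_STATE_ESN) {
		a.sglists += 2;		/* spi / seq_hi / seq_no become separate pieces */
		a.seqhilen += 4;	/* sizeof(__be32) */
		a.assoclen += a.seqhilen;
	}
	return a;
}

static int demo_authenc_name(char *buf, unsigned int flags,
			     const char *auth, const char *enc)
{
	int n;

	if (flags & DEMO_STATE_ESN)
		n = snprintf(buf, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth, enc);
	else
		n = snprintf(buf, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", auth, enc);
	return n >= CRYPTO_MAX_ALG_NAME ? -ENAMETOOLONG : 0;
}

int main(void)
{
	char name[CRYPTO_MAX_ALG_NAME];
	struct demo_assoc a = demo_esp_assoc(DEMO_STATE_ESN);

	demo_authenc_name(name, DEMO_STATE_ESN, "hmac(sha1)", "cbc(aes)");
	printf("%s: assoclen=%d sglists=%d\n", name, a.assoclen, a.sglists);
	return 0;
}
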
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999fa..22524716fe70 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -44,6 +44,7 @@
44#include <net/arp.h> 44#include <net/arp.h>
45#include <net/ip_fib.h> 45#include <net/ip_fib.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/xfrm.h>
47 48
48#ifndef CONFIG_IP_MULTIPLE_TABLES 49#ifndef CONFIG_IP_MULTIPLE_TABLES
49 50
@@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net)
51{ 52{
52 struct fib_table *local_table, *main_table; 53 struct fib_table *local_table, *main_table;
53 54
54 local_table = fib_hash_table(RT_TABLE_LOCAL); 55 local_table = fib_trie_table(RT_TABLE_LOCAL);
55 if (local_table == NULL) 56 if (local_table == NULL)
56 return -ENOMEM; 57 return -ENOMEM;
57 58
58 main_table = fib_hash_table(RT_TABLE_MAIN); 59 main_table = fib_trie_table(RT_TABLE_MAIN);
59 if (main_table == NULL) 60 if (main_table == NULL)
60 goto fail; 61 goto fail;
61 62
@@ -82,7 +83,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
82 if (tb) 83 if (tb)
83 return tb; 84 return tb;
84 85
85 tb = fib_hash_table(id); 86 tb = fib_trie_table(id);
86 if (!tb) 87 if (!tb)
87 return NULL; 88 return NULL;
88 h = id & (FIB_TABLE_HASHSZ - 1); 89 h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +115,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
114} 115}
115#endif /* CONFIG_IP_MULTIPLE_TABLES */ 116#endif /* CONFIG_IP_MULTIPLE_TABLES */
116 117
117void fib_select_default(struct net *net,
118 const struct flowi *flp, struct fib_result *res)
119{
120 struct fib_table *tb;
121 int table = RT_TABLE_MAIN;
122#ifdef CONFIG_IP_MULTIPLE_TABLES
123 if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
124 return;
125 table = res->r->table;
126#endif
127 tb = fib_get_table(net, table);
128 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
129 fib_table_select_default(tb, flp, res);
130}
131
132static void fib_flush(struct net *net) 118static void fib_flush(struct net *net)
133{ 119{
134 int flushed = 0; 120 int flushed = 0;
@@ -148,36 +134,6 @@ static void fib_flush(struct net *net)
148} 134}
149 135
150/* 136/*
151 * Find the first device with a given source address.
152 */
153
154struct net_device * ip_dev_find(struct net *net, __be32 addr)
155{
156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
157 struct fib_result res;
158 struct net_device *dev = NULL;
159 struct fib_table *local_table;
160
161#ifdef CONFIG_IP_MULTIPLE_TABLES
162 res.r = NULL;
163#endif
164
165 local_table = fib_get_table(net, RT_TABLE_LOCAL);
166 if (!local_table || fib_table_lookup(local_table, &fl, &res))
167 return NULL;
168 if (res.type != RTN_LOCAL)
169 goto out;
170 dev = FIB_RES_DEV(res);
171
172 if (dev)
173 dev_hold(dev);
174out:
175 fib_res_put(&res);
176 return dev;
177}
178EXPORT_SYMBOL(ip_dev_find);
179
180/*
181 * Find address type as if only "dev" was present in the system. If 137 * Find address type as if only "dev" was present in the system. If
182 * on_dev is NULL then all interfaces are taken into consideration. 138 * on_dev is NULL then all interfaces are taken into consideration.
183 */ 139 */
@@ -185,7 +141,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
185 const struct net_device *dev, 141 const struct net_device *dev,
186 __be32 addr) 142 __be32 addr)
187{ 143{
188 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 144 struct flowi4 fl4 = { .daddr = addr };
189 struct fib_result res; 145 struct fib_result res;
190 unsigned ret = RTN_BROADCAST; 146 unsigned ret = RTN_BROADCAST;
191 struct fib_table *local_table; 147 struct fib_table *local_table;
@@ -202,11 +158,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
202 local_table = fib_get_table(net, RT_TABLE_LOCAL); 158 local_table = fib_get_table(net, RT_TABLE_LOCAL);
203 if (local_table) { 159 if (local_table) {
204 ret = RTN_UNICAST; 160 ret = RTN_UNICAST;
205 if (!fib_table_lookup(local_table, &fl, &res)) { 161 rcu_read_lock();
162 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
206 if (!dev || dev == res.fi->fib_dev) 163 if (!dev || dev == res.fi->fib_dev)
207 ret = res.type; 164 ret = res.type;
208 fib_res_put(&res);
209 } 165 }
166 rcu_read_unlock();
210 } 167 }
211 return ret; 168 return ret;
212} 169}
@@ -220,59 +177,60 @@ EXPORT_SYMBOL(inet_addr_type);
220unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 177unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
221 __be32 addr) 178 __be32 addr)
222{ 179{
223 return __inet_dev_addr_type(net, dev, addr); 180 return __inet_dev_addr_type(net, dev, addr);
224} 181}
225EXPORT_SYMBOL(inet_dev_addr_type); 182EXPORT_SYMBOL(inet_dev_addr_type);
226 183
227/* Given (packet source, input interface) and optional (dst, oif, tos): 184/* Given (packet source, input interface) and optional (dst, oif, tos):
228 - (main) check, that source is valid i.e. not broadcast or our local 185 * - (main) check, that source is valid i.e. not broadcast or our local
229 address. 186 * address.
230 - figure out what "logical" interface this packet arrived 187 * - figure out what "logical" interface this packet arrived
231 and calculate "specific destination" address. 188 * and calculate "specific destination" address.
232 - check, that packet arrived from expected physical interface. 189 * - check, that packet arrived from expected physical interface.
190 * called with rcu_read_lock()
233 */ 191 */
234 192int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
235int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 193 int oif, struct net_device *dev, __be32 *spec_dst,
236 struct net_device *dev, __be32 *spec_dst, 194 u32 *itag)
237 u32 *itag, u32 mark)
238{ 195{
239 struct in_device *in_dev; 196 struct in_device *in_dev;
240 struct flowi fl = { .nl_u = { .ip4_u = 197 struct flowi4 fl4;
241 { .daddr = src,
242 .saddr = dst,
243 .tos = tos } },
244 .mark = mark,
245 .iif = oif };
246
247 struct fib_result res; 198 struct fib_result res;
248 int no_addr, rpf, accept_local; 199 int no_addr, rpf, accept_local;
249 bool dev_match; 200 bool dev_match;
250 int ret; 201 int ret;
251 struct net *net; 202 struct net *net;
252 203
204 fl4.flowi4_oif = 0;
205 fl4.flowi4_iif = oif;
206 fl4.daddr = src;
207 fl4.saddr = dst;
208 fl4.flowi4_tos = tos;
209 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
210
253 no_addr = rpf = accept_local = 0; 211 no_addr = rpf = accept_local = 0;
254 rcu_read_lock();
255 in_dev = __in_dev_get_rcu(dev); 212 in_dev = __in_dev_get_rcu(dev);
256 if (in_dev) { 213 if (in_dev) {
257 no_addr = in_dev->ifa_list == NULL; 214 no_addr = in_dev->ifa_list == NULL;
258 rpf = IN_DEV_RPFILTER(in_dev); 215
216 /* Ignore rp_filter for packets protected by IPsec. */
217 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
218
259 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); 219 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
260 if (mark && !IN_DEV_SRC_VMARK(in_dev)) 220 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
261 fl.mark = 0;
262 } 221 }
263 rcu_read_unlock();
264 222
265 if (in_dev == NULL) 223 if (in_dev == NULL)
266 goto e_inval; 224 goto e_inval;
267 225
268 net = dev_net(dev); 226 net = dev_net(dev);
269 if (fib_lookup(net, &fl, &res)) 227 if (fib_lookup(net, &fl4, &res))
270 goto last_resort; 228 goto last_resort;
271 if (res.type != RTN_UNICAST) { 229 if (res.type != RTN_UNICAST) {
272 if (res.type != RTN_LOCAL || !accept_local) 230 if (res.type != RTN_LOCAL || !accept_local)
273 goto e_inval_res; 231 goto e_inval;
274 } 232 }
275 *spec_dst = FIB_RES_PREFSRC(res); 233 *spec_dst = FIB_RES_PREFSRC(net, res);
276 fib_combine_itag(itag, &res); 234 fib_combine_itag(itag, &res);
277 dev_match = false; 235 dev_match = false;
278 236
@@ -291,23 +249,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
291#endif 249#endif
292 if (dev_match) { 250 if (dev_match) {
293 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 251 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
294 fib_res_put(&res);
295 return ret; 252 return ret;
296 } 253 }
297 fib_res_put(&res);
298 if (no_addr) 254 if (no_addr)
299 goto last_resort; 255 goto last_resort;
300 if (rpf == 1) 256 if (rpf == 1)
301 goto e_rpf; 257 goto e_rpf;
302 fl.oif = dev->ifindex; 258 fl4.flowi4_oif = dev->ifindex;
303 259
304 ret = 0; 260 ret = 0;
305 if (fib_lookup(net, &fl, &res) == 0) { 261 if (fib_lookup(net, &fl4, &res) == 0) {
306 if (res.type == RTN_UNICAST) { 262 if (res.type == RTN_UNICAST) {
307 *spec_dst = FIB_RES_PREFSRC(res); 263 *spec_dst = FIB_RES_PREFSRC(net, res);
308 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 264 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
309 } 265 }
310 fib_res_put(&res);
311 } 266 }
312 return ret; 267 return ret;
313 268
@@ -318,8 +273,6 @@ last_resort:
318 *itag = 0; 273 *itag = 0;
319 return 0; 274 return 0;
320 275
321e_inval_res:
322 fib_res_put(&res);
323e_inval: 276e_inval:
324 return -EINVAL; 277 return -EINVAL;
325e_rpf: 278e_rpf:
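
fib_validate_source() above now builds a struct flowi4 with the packet's source as the lookup destination and skips rp_filter entirely when the packet carries an IPsec secpath. The toy program below models that reverse-path decision over a two-entry routing table; the table, the device names and demo_validate_source() are invented, and it deliberately ignores the second oif-restricted lookup the kernel performs.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

struct demo_route { uint32_t prefix; uint32_t mask; const char *dev; };

static const struct demo_route table[] = {
	{ 0xc0a80100, 0xffffff00, "eth0" },	/* 192.168.1.0/24 via eth0 */
	{ 0x0a000000, 0xff000000, "eth1" },	/* 10.0.0.0/8     via eth1 */
};

static const char *demo_route_dev(uint32_t addr)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if ((addr & table[i].mask) == table[i].prefix)
			return table[i].dev;
	return NULL;
}

/* Return true if the packet's source passes validation on in_dev. */
static bool demo_validate_source(uint32_t src, const char *in_dev,
				 bool strict_rpf, bool has_secpath)
{
	const char *out_dev = demo_route_dev(src);

	if (has_secpath)		/* IPsec-protected: rp_filter is ignored */
		strict_rpf = false;
	if (!out_dev)
		return !strict_rpf;	/* unroutable source fails strict mode */
	if (strcmp(out_dev, in_dev) == 0)
		return true;		/* best route points back out the arrival device */
	return !strict_rpf;		/* asymmetric path: only loose mode accepts it */
}

int main(void)
{
	printf("10.0.0.5 arriving on eth0, strict rp_filter: %d\n",
	       demo_validate_source(0x0a000005, "eth0", true, false));
	printf("10.0.0.5 arriving on eth0, strict + IPsec:   %d\n",
	       demo_validate_source(0x0a000005, "eth0", true, true));
	return 0;
}
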
@@ -472,9 +425,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
472} 425}
473 426
474/* 427/*
475 * Handle IP routing ioctl calls. These are used to manipulate the routing tables 428 * Handle IP routing ioctl calls.
429 * These are used to manipulate the routing tables
476 */ 430 */
477
478int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 431int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
479{ 432{
480 struct fib_config cfg; 433 struct fib_config cfg;
@@ -518,7 +471,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
518 return -EINVAL; 471 return -EINVAL;
519} 472}
520 473
521const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { 474const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
522 [RTA_DST] = { .type = NLA_U32 }, 475 [RTA_DST] = { .type = NLA_U32 },
523 [RTA_SRC] = { .type = NLA_U32 }, 476 [RTA_SRC] = { .type = NLA_U32 },
524 [RTA_IIF] = { .type = NLA_U32 }, 477 [RTA_IIF] = { .type = NLA_U32 },
@@ -532,7 +485,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
532}; 485};
533 486
534static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 487static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
535 struct nlmsghdr *nlh, struct fib_config *cfg) 488 struct nlmsghdr *nlh, struct fib_config *cfg)
536{ 489{
537 struct nlattr *attr; 490 struct nlattr *attr;
538 int err, remaining; 491 int err, remaining;
@@ -687,12 +640,11 @@ out:
687} 640}
688 641
689/* Prepare and feed intra-kernel routing request. 642/* Prepare and feed intra-kernel routing request.
690 Really, it should be netlink message, but :-( netlink 643 * Really, it should be netlink message, but :-( netlink
691 can be not configured, so that we feed it directly 644 * can be not configured, so that we feed it directly
692 to fib engine. It is legal, because all events occur 645 * to fib engine. It is legal, because all events occur
693 only when netlink is already locked. 646 * only when netlink is already locked.
694 */ 647 */
695
696static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 648static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
697{ 649{
698 struct net *net = dev_net(ifa->ifa_dev->dev); 650 struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -738,9 +690,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
738 struct in_ifaddr *prim = ifa; 690 struct in_ifaddr *prim = ifa;
739 __be32 mask = ifa->ifa_mask; 691 __be32 mask = ifa->ifa_mask;
740 __be32 addr = ifa->ifa_local; 692 __be32 addr = ifa->ifa_local;
741 __be32 prefix = ifa->ifa_address&mask; 693 __be32 prefix = ifa->ifa_address & mask;
742 694
743 if (ifa->ifa_flags&IFA_F_SECONDARY) { 695 if (ifa->ifa_flags & IFA_F_SECONDARY) {
744 prim = inet_ifa_byprefix(in_dev, prefix, mask); 696 prim = inet_ifa_byprefix(in_dev, prefix, mask);
745 if (prim == NULL) { 697 if (prim == NULL) {
746 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); 698 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -750,58 +702,118 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
750 702
751 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 703 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
752 704
753 if (!(dev->flags&IFF_UP)) 705 if (!(dev->flags & IFF_UP))
754 return; 706 return;
755 707
756 /* Add broadcast address, if it is explicitly assigned. */ 708 /* Add broadcast address, if it is explicitly assigned. */
757 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 709 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
758 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 710 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
759 711
760 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && 712 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
761 (prefix != addr || ifa->ifa_prefixlen < 32)) { 713 (prefix != addr || ifa->ifa_prefixlen < 32)) {
762 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 714 fib_magic(RTM_NEWROUTE,
763 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); 715 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
716 prefix, ifa->ifa_prefixlen, prim);
764 717
765 /* Add network specific broadcasts, when it takes a sense */ 718 /* Add network specific broadcasts, when it takes a sense */
766 if (ifa->ifa_prefixlen < 31) { 719 if (ifa->ifa_prefixlen < 31) {
767 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 720 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
768 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); 721 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
722 32, prim);
769 } 723 }
770 } 724 }
771} 725}
772 726
773static void fib_del_ifaddr(struct in_ifaddr *ifa) 727/* Delete primary or secondary address.
728 * Optionally, on secondary address promotion consider the addresses
729 * from subnet iprim as deleted, even if they are in device list.
730 * In this case the secondary ifa can be in device list.
731 */
732void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
774{ 733{
775 struct in_device *in_dev = ifa->ifa_dev; 734 struct in_device *in_dev = ifa->ifa_dev;
776 struct net_device *dev = in_dev->dev; 735 struct net_device *dev = in_dev->dev;
777 struct in_ifaddr *ifa1; 736 struct in_ifaddr *ifa1;
778 struct in_ifaddr *prim = ifa; 737 struct in_ifaddr *prim = ifa, *prim1 = NULL;
779 __be32 brd = ifa->ifa_address|~ifa->ifa_mask; 738 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
780 __be32 any = ifa->ifa_address&ifa->ifa_mask; 739 __be32 any = ifa->ifa_address & ifa->ifa_mask;
781#define LOCAL_OK 1 740#define LOCAL_OK 1
782#define BRD_OK 2 741#define BRD_OK 2
783#define BRD0_OK 4 742#define BRD0_OK 4
784#define BRD1_OK 8 743#define BRD1_OK 8
785 unsigned ok = 0; 744 unsigned ok = 0;
745 int subnet = 0; /* Primary network */
746 int gone = 1; /* Address is missing */
747 int same_prefsrc = 0; /* Another primary with same IP */
786 748
787 if (!(ifa->ifa_flags&IFA_F_SECONDARY)) 749 if (ifa->ifa_flags & IFA_F_SECONDARY) {
788 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
789 RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
790 else {
791 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 750 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
792 if (prim == NULL) { 751 if (prim == NULL) {
793 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); 752 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
794 return; 753 return;
795 } 754 }
755 if (iprim && iprim != prim) {
756 printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
757 return;
758 }
759 } else if (!ipv4_is_zeronet(any) &&
760 (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
761 fib_magic(RTM_DELROUTE,
762 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
763 any, ifa->ifa_prefixlen, prim);
764 subnet = 1;
796 } 765 }
797 766
798 /* Deletion is more complicated than add. 767 /* Deletion is more complicated than add.
799 We should take care of not to delete too much :-) 768 * We should take care of not to delete too much :-)
800 769 *
801 Scan address list to be sure that addresses are really gone. 770 * Scan address list to be sure that addresses are really gone.
802 */ 771 */
803 772
804 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 773 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
774 if (ifa1 == ifa) {
775 /* promotion, keep the IP */
776 gone = 0;
777 continue;
778 }
779 /* Ignore IFAs from our subnet */
780 if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
781 inet_ifa_match(ifa1->ifa_address, iprim))
782 continue;
783
784 /* Ignore ifa1 if it uses different primary IP (prefsrc) */
785 if (ifa1->ifa_flags & IFA_F_SECONDARY) {
786 /* Another address from our subnet? */
787 if (ifa1->ifa_mask == prim->ifa_mask &&
788 inet_ifa_match(ifa1->ifa_address, prim))
789 prim1 = prim;
790 else {
791 /* We reached the secondaries, so
792 * same_prefsrc should be determined.
793 */
794 if (!same_prefsrc)
795 continue;
796 /* Search new prim1 if ifa1 is not
797 * using the current prim1
798 */
799 if (!prim1 ||
800 ifa1->ifa_mask != prim1->ifa_mask ||
801 !inet_ifa_match(ifa1->ifa_address, prim1))
802 prim1 = inet_ifa_byprefix(in_dev,
803 ifa1->ifa_address,
804 ifa1->ifa_mask);
805 if (!prim1)
806 continue;
807 if (prim1->ifa_local != prim->ifa_local)
808 continue;
809 }
810 } else {
811 if (prim->ifa_local != ifa1->ifa_local)
812 continue;
813 prim1 = ifa1;
814 if (prim != prim1)
815 same_prefsrc = 1;
816 }
805 if (ifa->ifa_local == ifa1->ifa_local) 817 if (ifa->ifa_local == ifa1->ifa_local)
806 ok |= LOCAL_OK; 818 ok |= LOCAL_OK;
807 if (ifa->ifa_broadcast == ifa1->ifa_broadcast) 819 if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -810,25 +822,43 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
810 ok |= BRD1_OK; 822 ok |= BRD1_OK;
811 if (any == ifa1->ifa_broadcast) 823 if (any == ifa1->ifa_broadcast)
812 ok |= BRD0_OK; 824 ok |= BRD0_OK;
825 /* primary has network specific broadcasts */
826 if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
827 __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
828 __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
829
830 if (!ipv4_is_zeronet(any1)) {
831 if (ifa->ifa_broadcast == brd1 ||
832 ifa->ifa_broadcast == any1)
833 ok |= BRD_OK;
834 if (brd == brd1 || brd == any1)
835 ok |= BRD1_OK;
836 if (any == brd1 || any == any1)
837 ok |= BRD0_OK;
838 }
839 }
813 } 840 }
814 841
815 if (!(ok&BRD_OK)) 842 if (!(ok & BRD_OK))
816 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 843 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
817 if (!(ok&BRD1_OK)) 844 if (subnet && ifa->ifa_prefixlen < 31) {
818 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 845 if (!(ok & BRD1_OK))
819 if (!(ok&BRD0_OK)) 846 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
820 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 847 if (!(ok & BRD0_OK))
821 if (!(ok&LOCAL_OK)) { 848 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
849 }
850 if (!(ok & LOCAL_OK)) {
822 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 851 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
823 852
824 /* Check, that this local address finally disappeared. */ 853 /* Check, that this local address finally disappeared. */
825 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 854 if (gone &&
855 inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
826 /* And the last, but not the least thing. 856 /* And the last, but not the least thing.
827 We must flush stray FIB entries. 857 * We must flush stray FIB entries.
828 858 *
829 First of all, we scan fib_info list searching 859 * First of all, we scan fib_info list searching
830 for stray nexthop entries, then ignite fib_flush. 860 * for stray nexthop entries, then ignite fib_flush.
831 */ 861 */
832 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 862 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
833 fib_flush(dev_net(dev)); 863 fib_flush(dev_net(dev));
834 } 864 }
@@ -839,14 +869,16 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
839#undef BRD1_OK 869#undef BRD1_OK
840} 870}
841 871
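The LOCAL_OK/BRD_OK bookkeeping in fib_del_ifaddr() above is a scan-then-delete pattern: walk the addresses that remain, OR in a bit for every route that is still justified, and only remove what ended up unreferenced. A hedged, generic sketch of the same idea (names and data are illustrative, not kernel API):

    #include <stdio.h>

    #define LOCAL_OK 1
    #define BRD_OK   2

    struct entry { unsigned int local, brd; };

    int main(void)
    {
            struct entry remaining[] = { { 10, 20 }, { 11, 21 } };
            struct entry going = { 10, 99 };       /* entry being removed */
            unsigned int ok = 0;
            size_t i;

            /* Scan what is left and mark every route that is still needed. */
            for (i = 0; i < sizeof(remaining) / sizeof(remaining[0]); i++) {
                    if (remaining[i].local == going.local)
                            ok |= LOCAL_OK;
                    if (remaining[i].brd == going.brd)
                            ok |= BRD_OK;
            }
            if (!(ok & LOCAL_OK))
                    printf("drop local route %u\n", going.local);
            if (!(ok & BRD_OK))
                    printf("drop broadcast route %u\n", going.brd);
            return 0;
    }
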
842static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) 872static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
843{ 873{
844 874
845 struct fib_result res; 875 struct fib_result res;
846 struct flowi fl = { .mark = frn->fl_mark, 876 struct flowi4 fl4 = {
847 .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 877 .flowi4_mark = frn->fl_mark,
848 .tos = frn->fl_tos, 878 .daddr = frn->fl_addr,
849 .scope = frn->fl_scope } } }; 879 .flowi4_tos = frn->fl_tos,
880 .flowi4_scope = frn->fl_scope,
881 };
850 882
851#ifdef CONFIG_IP_MULTIPLE_TABLES 883#ifdef CONFIG_IP_MULTIPLE_TABLES
852 res.r = NULL; 884 res.r = NULL;
@@ -857,15 +889,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
857 local_bh_disable(); 889 local_bh_disable();
858 890
859 frn->tb_id = tb->tb_id; 891 frn->tb_id = tb->tb_id;
860 frn->err = fib_table_lookup(tb, &fl, &res); 892 rcu_read_lock();
893 frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
861 894
862 if (!frn->err) { 895 if (!frn->err) {
863 frn->prefixlen = res.prefixlen; 896 frn->prefixlen = res.prefixlen;
864 frn->nh_sel = res.nh_sel; 897 frn->nh_sel = res.nh_sel;
865 frn->type = res.type; 898 frn->type = res.type;
866 frn->scope = res.scope; 899 frn->scope = res.scope;
867 fib_res_put(&res);
868 } 900 }
901 rcu_read_unlock();
869 local_bh_enable(); 902 local_bh_enable();
870 } 903 }
871} 904}
@@ -894,8 +927,8 @@ static void nl_fib_input(struct sk_buff *skb)
894 927
895 nl_fib_lookup(frn, tb); 928 nl_fib_lookup(frn, tb);
896 929
897 pid = NETLINK_CB(skb).pid; /* pid of sending process */ 930 pid = NETLINK_CB(skb).pid; /* pid of sending process */
898 NETLINK_CB(skb).pid = 0; /* from kernel */ 931 NETLINK_CB(skb).pid = 0; /* from kernel */
899 NETLINK_CB(skb).dst_group = 0; /* unicast */ 932 NETLINK_CB(skb).dst_group = 0; /* unicast */
900 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 933 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
901} 934}
@@ -929,6 +962,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
929{ 962{
930 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; 963 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
931 struct net_device *dev = ifa->ifa_dev->dev; 964 struct net_device *dev = ifa->ifa_dev->dev;
965 struct net *net = dev_net(dev);
932 966
933 switch (event) { 967 switch (event) {
934 case NETDEV_UP: 968 case NETDEV_UP:
@@ -936,13 +970,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
936#ifdef CONFIG_IP_ROUTE_MULTIPATH 970#ifdef CONFIG_IP_ROUTE_MULTIPATH
937 fib_sync_up(dev); 971 fib_sync_up(dev);
938#endif 972#endif
973 atomic_inc(&net->ipv4.dev_addr_genid);
939 rt_cache_flush(dev_net(dev), -1); 974 rt_cache_flush(dev_net(dev), -1);
940 break; 975 break;
941 case NETDEV_DOWN: 976 case NETDEV_DOWN:
942 fib_del_ifaddr(ifa); 977 fib_del_ifaddr(ifa, NULL);
978 atomic_inc(&net->ipv4.dev_addr_genid);
943 if (ifa->ifa_dev->ifa_list == NULL) { 979 if (ifa->ifa_dev->ifa_list == NULL) {
944 /* Last address was deleted from this interface. 980 /* Last address was deleted from this interface.
945 Disable IP. 981 * Disable IP.
946 */ 982 */
947 fib_disable_ip(dev, 1, 0); 983 fib_disable_ip(dev, 1, 0);
948 } else { 984 } else {
@@ -957,6 +993,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
957{ 993{
958 struct net_device *dev = ptr; 994 struct net_device *dev = ptr;
959 struct in_device *in_dev = __in_dev_get_rtnl(dev); 995 struct in_device *in_dev = __in_dev_get_rtnl(dev);
996 struct net *net = dev_net(dev);
960 997
961 if (event == NETDEV_UNREGISTER) { 998 if (event == NETDEV_UNREGISTER) {
962 fib_disable_ip(dev, 2, -1); 999 fib_disable_ip(dev, 2, -1);
@@ -974,6 +1011,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
974#ifdef CONFIG_IP_ROUTE_MULTIPATH 1011#ifdef CONFIG_IP_ROUTE_MULTIPATH
975 fib_sync_up(dev); 1012 fib_sync_up(dev);
976#endif 1013#endif
1014 atomic_inc(&net->ipv4.dev_addr_genid);
977 rt_cache_flush(dev_net(dev), -1); 1015 rt_cache_flush(dev_net(dev), -1);
978 break; 1016 break;
979 case NETDEV_DOWN: 1017 case NETDEV_DOWN:
@@ -984,7 +1022,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
984 rt_cache_flush(dev_net(dev), 0); 1022 rt_cache_flush(dev_net(dev), 0);
985 break; 1023 break;
986 case NETDEV_UNREGISTER_BATCH: 1024 case NETDEV_UNREGISTER_BATCH:
987 rt_cache_flush_batch(); 1025 /* The batch unregister is only called on the first
1026 * device in the list of devices being unregistered.
1027 * Therefore we should not pass dev_net(dev) in here.
1028 */
1029 rt_cache_flush_batch(NULL);
988 break; 1030 break;
989 } 1031 }
990 return NOTIFY_DONE; 1032 return NOTIFY_DONE;
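The new dev_addr_genid counter added in both notifiers above is a generation number: every address event bumps it, and consumers that cached a derived result compare generations instead of recomputing on every use. A small sketch of the idea with C11 atomics (the structure and names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint genid;

    struct cached { unsigned int gen; int value; };

    static int slow_compute(void) { return 42; }

    static int lookup(struct cached *c)
    {
            unsigned int now = atomic_load(&genid);

            if (c->gen != now) {            /* cache is stale, recompute */
                    c->value = slow_compute();
                    c->gen = now;
            }
            return c->value;
    }

    int main(void)
    {
            struct cached c = { .gen = ~0u };

            printf("%d\n", lookup(&c));
            atomic_fetch_add(&genid, 1);    /* e.g. an address was added */
            printf("%d\n", lookup(&c));     /* forces a recompute        */
            return 0;
    }
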
@@ -1001,16 +1043,15 @@ static struct notifier_block fib_netdev_notifier = {
1001static int __net_init ip_fib_net_init(struct net *net) 1043static int __net_init ip_fib_net_init(struct net *net)
1002{ 1044{
1003 int err; 1045 int err;
1004 unsigned int i; 1046 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1047
1048 /* Avoid false sharing : Use at least a full cache line */
1049 size = max_t(size_t, size, L1_CACHE_BYTES);
1005 1050
1006 net->ipv4.fib_table_hash = kzalloc( 1051 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1007 sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
1008 if (net->ipv4.fib_table_hash == NULL) 1052 if (net->ipv4.fib_table_hash == NULL)
1009 return -ENOMEM; 1053 return -ENOMEM;
1010 1054
1011 for (i = 0; i < FIB_TABLE_HASHSZ; i++)
1012 INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
1013
1014 err = fib4_rules_init(net); 1055 err = fib4_rules_init(net);
1015 if (err < 0) 1056 if (err < 0)
1016 goto fail; 1057 goto fail;
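Rounding the hash-head allocation up to at least one cache line, as the hunk above does with max_t(), keeps unrelated per-namespace tables from sharing a line and bouncing it between CPUs. A userspace sketch of the same sizing rule (64 is an assumed line size for illustration, not something queried from the kernel):

    #include <stdio.h>
    #include <stdlib.h>

    #define ASSUMED_L1_CACHE_BYTES 64   /* assumption for illustration */

    static size_t round_to_cache_line(size_t size)
    {
            return size < ASSUMED_L1_CACHE_BYTES ? ASSUMED_L1_CACHE_BYTES : size;
    }

    int main(void)
    {
            size_t want = 16 * sizeof(void *);        /* small hash table     */
            size_t size = round_to_cache_line(want);
            void *tbl = calloc(1, size);              /* zeroed, like kzalloc */

            printf("requested %zu, allocated %zu bytes\n", want, size);
            free(tbl);
            return 0;
    }
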
@@ -1029,6 +1070,7 @@ static void ip_fib_net_exit(struct net *net)
1029 fib4_rules_exit(net); 1070 fib4_rules_exit(net);
1030#endif 1071#endif
1031 1072
1073 rtnl_lock();
1032 for (i = 0; i < FIB_TABLE_HASHSZ; i++) { 1074 for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1033 struct fib_table *tb; 1075 struct fib_table *tb;
1034 struct hlist_head *head; 1076 struct hlist_head *head;
@@ -1038,9 +1080,10 @@ static void ip_fib_net_exit(struct net *net)
1038 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { 1080 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1039 hlist_del(node); 1081 hlist_del(node);
1040 fib_table_flush(tb); 1082 fib_table_flush(tb);
1041 kfree(tb); 1083 fib_free_table(tb);
1042 } 1084 }
1043 } 1085 }
1086 rtnl_unlock();
1044 kfree(net->ipv4.fib_table_hash); 1087 kfree(net->ipv4.fib_table_hash);
1045} 1088}
1046 1089
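The teardown loop above walks each chain with hlist_for_each_entry_safe() precisely so entries can be unlinked and freed mid-walk. The same idea in plain C: save the next pointer before freeing the current node (the list layout here is illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; struct node *next; };

    int main(void)
    {
            struct node *head = NULL, *n, *next;
            int i;

            for (i = 0; i < 3; i++) {        /* build a small list */
                    n = malloc(sizeof(*n));
                    n->id = i;
                    n->next = head;
                    head = n;
            }

            for (n = head; n; n = next) {    /* "safe" walk: free as we go */
                    next = n->next;
                    printf("flushing %d\n", n->id);
                    free(n);
            }
            return 0;
    }
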
@@ -1089,5 +1132,5 @@ void __init ip_fib_init(void)
1089 register_netdevice_notifier(&fib_netdev_notifier); 1132 register_netdevice_notifier(&fib_netdev_notifier);
1090 register_inetaddr_notifier(&fib_inetaddr_notifier); 1133 register_inetaddr_notifier(&fib_inetaddr_notifier);
1091 1134
1092 fib_hash_init(); 1135 fib_trie_init();
1093} 1136}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index 4ed7e0dea1bc..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1070 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57struct fn_zone {
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61
62 int fz_divisor; /* Hash divisor */
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65
66 int fz_order; /* Zone order */
67 __be32 fz_mask;
68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70
71/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
72 * can be cheaper than memory lookup, so that FZ_* macros are used.
73 */
74
75struct fn_hash {
76 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list;
78};
79
80static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
81{
82 u32 h = ntohl(key)>>(32 - fz->fz_order);
83 h ^= (h>>20);
84 h ^= (h>>10);
85 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz);
87 return h;
88}
89
90static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
91{
92 return dst & FZ_MASK(fz);
93}
94
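fn_hash() above keys each zone by the top fz_order bits of the destination and xor-folds them down to a bucket index, while fz_key() masks the destination to the zone's prefix. A standalone sketch of both steps (values are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t zone_hash(uint32_t key_host_order, int order, uint32_t hashmask)
    {
            uint32_t h = key_host_order >> (32 - order);  /* keep prefix bits */

            h ^= h >> 20;                                 /* xor-fold         */
            h ^= h >> 10;
            h ^= h >> 5;
            return h & hashmask;
    }

    int main(void)
    {
            uint32_t dst = 0x0a010203;          /* 10.1.2.3, host byte order */
            int order = 24;                     /* /24 zone                  */
            uint32_t mask = 0xffffff00;
            uint32_t key = dst & mask;          /* like fz_key()             */

            printf("bucket=%u\n", (unsigned)zone_hash(key, order, 15));
            return 0;
    }
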
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE) {
105 return kzalloc(size, GFP_KERNEL);
106 } else {
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109 }
110}
111
112/* The fib hash lock must be held when this is called. */
113static inline void fn_rebuild_zone(struct fn_zone *fz,
114 struct hlist_head *old_ht,
115 int old_divisor)
116{
117 int i;
118
119 for (i = 0; i < old_divisor; i++) {
120 struct hlist_node *node, *n;
121 struct fib_node *f;
122
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head;
125
126 hlist_del(&f->fn_hash);
127
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
129 hlist_add_head(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case 16:
154 new_divisor = 256;
155 break;
156 case 256:
157 new_divisor = 1024;
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 write_lock_bh(&fib_hash_lock);
179 old_ht = fz->fz_hash;
180 fz->fz_hash = ht;
181 fz->fz_hashmask = new_hashmask;
182 fz->fz_divisor = new_divisor;
183 fn_rebuild_zone(fz, old_ht, old_divisor);
184 fib_hash_genid++;
185 write_unlock_bh(&fib_hash_lock);
186
187 fz_hash_free(old_ht, old_divisor);
188 }
189}
190
191static inline void fn_free_node(struct fib_node * f)
192{
193 kmem_cache_free(fn_hash_kmem, f);
194}
195
196static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
197{
198 fib_release_info(fa->fa_info);
199 if (fa == &f->fn_embedded_alias)
200 fa->fa_info = NULL;
201 else
202 kmem_cache_free(fn_alias_kmem, fa);
203}
204
205static struct fn_zone *
206fn_new_zone(struct fn_hash *table, int z)
207{
208 int i;
209 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
210 if (!fz)
211 return NULL;
212
213 if (z) {
214 fz->fz_divisor = 16;
215 } else {
216 fz->fz_divisor = 1;
217 }
218 fz->fz_hashmask = (fz->fz_divisor - 1);
219 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
220 if (!fz->fz_hash) {
221 kfree(fz);
222 return NULL;
223 }
224 fz->fz_order = z;
225 fz->fz_mask = inet_make_mask(z);
226
227 /* Find the first not empty zone with more specific mask */
228 for (i=z+1; i<=32; i++)
229 if (table->fn_zones[i])
230 break;
231 write_lock_bh(&fib_hash_lock);
232 if (i>32) {
233 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list;
235 table->fn_zone_list = fz;
236 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next;
238 table->fn_zones[i]->fz_next = fz;
239 }
240 table->fn_zones[z] = fz;
241 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz;
244}
245
246int fib_table_lookup(struct fib_table *tb,
247 const struct flowi *flp, struct fib_result *res)
248{
249 int err;
250 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
252
253 read_lock(&fib_hash_lock);
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
255 struct hlist_head *head;
256 struct hlist_node *node;
257 struct fib_node *f;
258 __be32 k = fz_key(flp->fl4_dst, fz);
259
260 head = &fz->fz_hash[fn_hash(k, fz)];
261 hlist_for_each_entry(f, node, head, fn_hash) {
262 if (f->fn_key != k)
263 continue;
264
265 err = fib_semantic_match(&f->fn_alias,
266 flp, res,
267 fz->fz_order);
268 if (err <= 0)
269 goto out;
270 }
271 }
272 err = 1;
273out:
274 read_unlock(&fib_hash_lock);
275 return err;
276}
277
278void fib_table_select_default(struct fib_table *tb,
279 const struct flowi *flp, struct fib_result *res)
280{
281 int order, last_idx;
282 struct hlist_node *node;
283 struct fib_node *f;
284 struct fib_info *fi = NULL;
285 struct fib_info *last_resort;
286 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
287 struct fn_zone *fz = t->fn_zones[0];
288
289 if (fz == NULL)
290 return;
291
292 last_idx = -1;
293 last_resort = NULL;
294 order = -1;
295
296 read_lock(&fib_hash_lock);
297 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
298 struct fib_alias *fa;
299
300 list_for_each_entry(fa, &f->fn_alias, fa_list) {
301 struct fib_info *next_fi = fa->fa_info;
302
303 if (fa->fa_scope != res->scope ||
304 fa->fa_type != RTN_UNICAST)
305 continue;
306
307 if (next_fi->fib_priority > res->fi->fib_priority)
308 break;
309 if (!next_fi->fib_nh[0].nh_gw ||
310 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
311 continue;
312 fa->fa_state |= FA_S_ACCESSED;
313
314 if (fi == NULL) {
315 if (next_fi != res->fi)
316 break;
317 } else if (!fib_detect_death(fi, order, &last_resort,
318 &last_idx, tb->tb_default)) {
319 fib_result_assign(res, fi);
320 tb->tb_default = order;
321 goto out;
322 }
323 fi = next_fi;
324 order++;
325 }
326 }
327
328 if (order <= 0 || fi == NULL) {
329 tb->tb_default = -1;
330 goto out;
331 }
332
333 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
334 tb->tb_default)) {
335 fib_result_assign(res, fi);
336 tb->tb_default = order;
337 goto out;
338 }
339
340 if (last_idx >= 0)
341 fib_result_assign(res, last_resort);
342 tb->tb_default = last_idx;
343out:
344 read_unlock(&fib_hash_lock);
345}
346
347/* Insert node F to FZ. */
348static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
349{
350 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
351
352 hlist_add_head(&f->fn_hash, head);
353}
354
355/* Return the node in FZ matching KEY. */
356static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
357{
358 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
359 struct hlist_node *node;
360 struct fib_node *f;
361
362 hlist_for_each_entry(f, node, head, fn_hash) {
363 if (f->fn_key == key)
364 return f;
365 }
366
367 return NULL;
368}
369
370int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
371{
372 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
373 struct fib_node *new_f = NULL;
374 struct fib_node *f;
375 struct fib_alias *fa, *new_fa;
376 struct fn_zone *fz;
377 struct fib_info *fi;
378 u8 tos = cfg->fc_tos;
379 __be32 key;
380 int err;
381
382 if (cfg->fc_dst_len > 32)
383 return -EINVAL;
384
385 fz = table->fn_zones[cfg->fc_dst_len];
386 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
387 return -ENOBUFS;
388
389 key = 0;
390 if (cfg->fc_dst) {
391 if (cfg->fc_dst & ~FZ_MASK(fz))
392 return -EINVAL;
393 key = fz_key(cfg->fc_dst, fz);
394 }
395
396 fi = fib_create_info(cfg);
397 if (IS_ERR(fi))
398 return PTR_ERR(fi);
399
400 if (fz->fz_nent > (fz->fz_divisor<<1) &&
401 fz->fz_divisor < FZ_MAX_DIVISOR &&
402 (cfg->fc_dst_len == 32 ||
403 (1 << cfg->fc_dst_len) > fz->fz_divisor))
404 fn_rehash_zone(fz);
405
406 f = fib_find_node(fz, key);
407
408 if (!f)
409 fa = NULL;
410 else
411 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
412
413 /* Now fa, if non-NULL, points to the first fib alias
414 * with the same keys [prefix,tos,priority], if such key already
415 * exists or to the node before which we will insert new one.
416 *
417 * If fa is NULL, we will need to allocate a new one and
418 * insert to the head of f.
419 *
420 * If f is NULL, no fib node matched the destination key
421 * and we need to allocate a new one of those as well.
422 */
423
424 if (fa && fa->fa_tos == tos &&
425 fa->fa_info->fib_priority == fi->fib_priority) {
426 struct fib_alias *fa_first, *fa_match;
427
428 err = -EEXIST;
429 if (cfg->fc_nlflags & NLM_F_EXCL)
430 goto out;
431
432 /* We have 2 goals:
433 * 1. Find exact match for type, scope, fib_info to avoid
434 * duplicate routes
435 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
436 */
437 fa_match = NULL;
438 fa_first = fa;
439 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
440 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
441 if (fa->fa_tos != tos)
442 break;
443 if (fa->fa_info->fib_priority != fi->fib_priority)
444 break;
445 if (fa->fa_type == cfg->fc_type &&
446 fa->fa_scope == cfg->fc_scope &&
447 fa->fa_info == fi) {
448 fa_match = fa;
449 break;
450 }
451 }
452
453 if (cfg->fc_nlflags & NLM_F_REPLACE) {
454 struct fib_info *fi_drop;
455 u8 state;
456
457 fa = fa_first;
458 if (fa_match) {
459 if (fa == fa_match)
460 err = 0;
461 goto out;
462 }
463 write_lock_bh(&fib_hash_lock);
464 fi_drop = fa->fa_info;
465 fa->fa_info = fi;
466 fa->fa_type = cfg->fc_type;
467 fa->fa_scope = cfg->fc_scope;
468 state = fa->fa_state;
469 fa->fa_state &= ~FA_S_ACCESSED;
470 fib_hash_genid++;
471 write_unlock_bh(&fib_hash_lock);
472
473 fib_release_info(fi_drop);
474 if (state & FA_S_ACCESSED)
475 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
476 rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
477 &cfg->fc_nlinfo, NLM_F_REPLACE);
478 return 0;
479 }
480
481 /* Error if we find a perfect match which
482 * uses the same scope, type, and nexthop
483 * information.
484 */
485 if (fa_match)
486 goto out;
487
488 if (!(cfg->fc_nlflags & NLM_F_APPEND))
489 fa = fa_first;
490 }
491
492 err = -ENOENT;
493 if (!(cfg->fc_nlflags & NLM_F_CREATE))
494 goto out;
495
496 err = -ENOBUFS;
497
498 if (!f) {
499 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
500 if (new_f == NULL)
501 goto out;
502
503 INIT_HLIST_NODE(&new_f->fn_hash);
504 INIT_LIST_HEAD(&new_f->fn_alias);
505 new_f->fn_key = key;
506 f = new_f;
507 }
508
509 new_fa = &f->fn_embedded_alias;
510 if (new_fa->fa_info != NULL) {
511 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
512 if (new_fa == NULL)
513 goto out;
514 }
515 new_fa->fa_info = fi;
516 new_fa->fa_tos = tos;
517 new_fa->fa_type = cfg->fc_type;
518 new_fa->fa_scope = cfg->fc_scope;
519 new_fa->fa_state = 0;
520
521 /*
522 * Insert new entry to the list.
523 */
524
525 write_lock_bh(&fib_hash_lock);
526 if (new_f)
527 fib_insert_node(fz, new_f);
528 list_add_tail(&new_fa->fa_list,
529 (fa ? &fa->fa_list : &f->fn_alias));
530 fib_hash_genid++;
531 write_unlock_bh(&fib_hash_lock);
532
533 if (new_f)
534 fz->fz_nent++;
535 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
536
537 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
538 &cfg->fc_nlinfo, 0);
539 return 0;
540
541out:
542 if (new_f)
543 kmem_cache_free(fn_hash_kmem, new_f);
544 fib_release_info(fi);
545 return err;
546}
547
548int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
549{
550 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
551 struct fib_node *f;
552 struct fib_alias *fa, *fa_to_delete;
553 struct fn_zone *fz;
554 __be32 key;
555
556 if (cfg->fc_dst_len > 32)
557 return -EINVAL;
558
559 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
560 return -ESRCH;
561
562 key = 0;
563 if (cfg->fc_dst) {
564 if (cfg->fc_dst & ~FZ_MASK(fz))
565 return -EINVAL;
566 key = fz_key(cfg->fc_dst, fz);
567 }
568
569 f = fib_find_node(fz, key);
570
571 if (!f)
572 fa = NULL;
573 else
574 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
575 if (!fa)
576 return -ESRCH;
577
578 fa_to_delete = NULL;
579 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
580 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
581 struct fib_info *fi = fa->fa_info;
582
583 if (fa->fa_tos != cfg->fc_tos)
584 break;
585
586 if ((!cfg->fc_type ||
587 fa->fa_type == cfg->fc_type) &&
588 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
589 fa->fa_scope == cfg->fc_scope) &&
590 (!cfg->fc_protocol ||
591 fi->fib_protocol == cfg->fc_protocol) &&
592 fib_nh_match(cfg, fi) == 0) {
593 fa_to_delete = fa;
594 break;
595 }
596 }
597
598 if (fa_to_delete) {
599 int kill_fn;
600
601 fa = fa_to_delete;
602 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
603 tb->tb_id, &cfg->fc_nlinfo, 0);
604
605 kill_fn = 0;
606 write_lock_bh(&fib_hash_lock);
607 list_del(&fa->fa_list);
608 if (list_empty(&f->fn_alias)) {
609 hlist_del(&f->fn_hash);
610 kill_fn = 1;
611 }
612 fib_hash_genid++;
613 write_unlock_bh(&fib_hash_lock);
614
615 if (fa->fa_state & FA_S_ACCESSED)
616 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
617 fn_free_alias(fa, f);
618 if (kill_fn) {
619 fn_free_node(f);
620 fz->fz_nent--;
621 }
622
623 return 0;
624 }
625 return -ESRCH;
626}
627
628static int fn_flush_list(struct fn_zone *fz, int idx)
629{
630 struct hlist_head *head = &fz->fz_hash[idx];
631 struct hlist_node *node, *n;
632 struct fib_node *f;
633 int found = 0;
634
635 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
636 struct fib_alias *fa, *fa_node;
637 int kill_f;
638
639 kill_f = 0;
640 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
641 struct fib_info *fi = fa->fa_info;
642
643 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
644 write_lock_bh(&fib_hash_lock);
645 list_del(&fa->fa_list);
646 if (list_empty(&f->fn_alias)) {
647 hlist_del(&f->fn_hash);
648 kill_f = 1;
649 }
650 fib_hash_genid++;
651 write_unlock_bh(&fib_hash_lock);
652
653 fn_free_alias(fa, f);
654 found++;
655 }
656 }
657 if (kill_f) {
658 fn_free_node(f);
659 fz->fz_nent--;
660 }
661 }
662 return found;
663}
664
665int fib_table_flush(struct fib_table *tb)
666{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz;
669 int found = 0;
670
671 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
672 int i;
673
674 for (i = fz->fz_divisor - 1; i >= 0; i--)
675 found += fn_flush_list(fz, i);
676 }
677 return found;
678}
679
680
681static inline int
682fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
683 struct fib_table *tb,
684 struct fn_zone *fz,
685 struct hlist_head *head)
686{
687 struct hlist_node *node;
688 struct fib_node *f;
689 int i, s_i;
690
691 s_i = cb->args[4];
692 i = 0;
693 hlist_for_each_entry(f, node, head, fn_hash) {
694 struct fib_alias *fa;
695
696 list_for_each_entry(fa, &f->fn_alias, fa_list) {
697 if (i < s_i)
698 goto next;
699
700 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
701 cb->nlh->nlmsg_seq,
702 RTM_NEWROUTE,
703 tb->tb_id,
704 fa->fa_type,
705 fa->fa_scope,
706 f->fn_key,
707 fz->fz_order,
708 fa->fa_tos,
709 fa->fa_info,
710 NLM_F_MULTI) < 0) {
711 cb->args[4] = i;
712 return -1;
713 }
714 next:
715 i++;
716 }
717 }
718 cb->args[4] = i;
719 return skb->len;
720}
721
722static inline int
723fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
724 struct fib_table *tb,
725 struct fn_zone *fz)
726{
727 int h, s_h;
728
729 if (fz->fz_hash == NULL)
730 return skb->len;
731 s_h = cb->args[3];
732 for (h = s_h; h < fz->fz_divisor; h++) {
733 if (hlist_empty(&fz->fz_hash[h]))
734 continue;
735 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) {
736 cb->args[3] = h;
737 return -1;
738 }
739 memset(&cb->args[4], 0,
740 sizeof(cb->args) - 4*sizeof(cb->args[0]));
741 }
742 cb->args[3] = h;
743 return skb->len;
744}
745
746int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
747 struct netlink_callback *cb)
748{
749 int m, s_m;
750 struct fn_zone *fz;
751 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
752
753 s_m = cb->args[2];
754 read_lock(&fib_hash_lock);
755 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
756 if (m < s_m) continue;
757 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
758 cb->args[2] = m;
759 read_unlock(&fib_hash_lock);
760 return -1;
761 }
762 memset(&cb->args[3], 0,
763 sizeof(cb->args) - 3*sizeof(cb->args[0]));
764 }
765 read_unlock(&fib_hash_lock);
766 cb->args[2] = m;
767 return skb->len;
768}
769
770void __init fib_hash_init(void)
771{
772 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
773 0, SLAB_PANIC, NULL);
774
775 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
776 0, SLAB_PANIC, NULL);
777
778}
779
780struct fib_table *fib_hash_table(u32 id)
781{
782 struct fib_table *tb;
783
784 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
785 GFP_KERNEL);
786 if (tb == NULL)
787 return NULL;
788
789 tb->tb_id = id;
790 tb->tb_default = -1;
791
792 memset(tb->tb_data, 0, sizeof(struct fn_hash));
793 return tb;
794}
795
796/* ------------------------------------------------------------------------ */
797#ifdef CONFIG_PROC_FS
798
799struct fib_iter_state {
800 struct seq_net_private p;
801 struct fn_zone *zone;
802 int bucket;
803 struct hlist_head *hash_head;
804 struct fib_node *fn;
805 struct fib_alias *fa;
806 loff_t pos;
807 unsigned int genid;
808 int valid;
809};
810
811static struct fib_alias *fib_get_first(struct seq_file *seq)
812{
813 struct fib_iter_state *iter = seq->private;
814 struct fib_table *main_table;
815 struct fn_hash *table;
816
817 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
818 table = (struct fn_hash *)main_table->tb_data;
819
820 iter->bucket = 0;
821 iter->hash_head = NULL;
822 iter->fn = NULL;
823 iter->fa = NULL;
824 iter->pos = 0;
825 iter->genid = fib_hash_genid;
826 iter->valid = 1;
827
828 for (iter->zone = table->fn_zone_list; iter->zone;
829 iter->zone = iter->zone->fz_next) {
830 int maxslot;
831
832 if (!iter->zone->fz_nent)
833 continue;
834
835 iter->hash_head = iter->zone->fz_hash;
836 maxslot = iter->zone->fz_divisor;
837
838 for (iter->bucket = 0; iter->bucket < maxslot;
839 ++iter->bucket, ++iter->hash_head) {
840 struct hlist_node *node;
841 struct fib_node *fn;
842
843 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
844 struct fib_alias *fa;
845
846 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
847 iter->fn = fn;
848 iter->fa = fa;
849 goto out;
850 }
851 }
852 }
853 }
854out:
855 return iter->fa;
856}
857
858static struct fib_alias *fib_get_next(struct seq_file *seq)
859{
860 struct fib_iter_state *iter = seq->private;
861 struct fib_node *fn;
862 struct fib_alias *fa;
863
864 /* Advance FA, if any. */
865 fn = iter->fn;
866 fa = iter->fa;
867 if (fa) {
868 BUG_ON(!fn);
869 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
870 iter->fa = fa;
871 goto out;
872 }
873 }
874
875 fa = iter->fa = NULL;
876
877 /* Advance FN. */
878 if (fn) {
879 struct hlist_node *node = &fn->fn_hash;
880 hlist_for_each_entry_continue(fn, node, fn_hash) {
881 iter->fn = fn;
882
883 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
884 iter->fa = fa;
885 goto out;
886 }
887 }
888 }
889
890 fn = iter->fn = NULL;
891
892 /* Advance hash chain. */
893 if (!iter->zone)
894 goto out;
895
896 for (;;) {
897 struct hlist_node *node;
898 int maxslot;
899
900 maxslot = iter->zone->fz_divisor;
901
902 while (++iter->bucket < maxslot) {
903 iter->hash_head++;
904
905 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
906 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
907 iter->fn = fn;
908 iter->fa = fa;
909 goto out;
910 }
911 }
912 }
913
914 iter->zone = iter->zone->fz_next;
915
916 if (!iter->zone)
917 goto out;
918
919 iter->bucket = 0;
920 iter->hash_head = iter->zone->fz_hash;
921
922 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
923 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
924 iter->fn = fn;
925 iter->fa = fa;
926 goto out;
927 }
928 }
929 }
930out:
931 iter->pos++;
932 return fa;
933}
934
935static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
936{
937 struct fib_iter_state *iter = seq->private;
938 struct fib_alias *fa;
939
940 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
941 fa = iter->fa;
942 pos -= iter->pos;
943 } else
944 fa = fib_get_first(seq);
945
946 if (fa)
947 while (pos && (fa = fib_get_next(seq)))
948 --pos;
949 return pos ? NULL : fa;
950}
951
952static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
953 __acquires(fib_hash_lock)
954{
955 void *v = NULL;
956
957 read_lock(&fib_hash_lock);
958 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
959 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
960 return v;
961}
962
963static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
964{
965 ++*pos;
966 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
967}
968
969static void fib_seq_stop(struct seq_file *seq, void *v)
970 __releases(fib_hash_lock)
971{
972 read_unlock(&fib_hash_lock);
973}
974
975static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
976{
977 static const unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT,
979 };
980 unsigned flags = type2flags[type];
981
982 if (fi && fi->fib_nh->nh_gw)
983 flags |= RTF_GATEWAY;
984 if (mask == htonl(0xFFFFFFFF))
985 flags |= RTF_HOST;
986 flags |= RTF_UP;
987 return flags;
988}
989
990/*
991 * This outputs /proc/net/route.
992 *
993 * It always works in backward compatibility mode.
994 * The format of the file is not supposed to be changed.
995 */
996static int fib_seq_show(struct seq_file *seq, void *v)
997{
998 struct fib_iter_state *iter;
999 int len;
1000 __be32 prefix, mask;
1001 unsigned flags;
1002 struct fib_node *f;
1003 struct fib_alias *fa;
1004 struct fib_info *fi;
1005
1006 if (v == SEQ_START_TOKEN) {
1007 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1008 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1009 "\tWindow\tIRTT");
1010 goto out;
1011 }
1012
1013 iter = seq->private;
1014 f = iter->fn;
1015 fa = iter->fa;
1016 fi = fa->fa_info;
1017 prefix = f->fn_key;
1018 mask = FZ_MASK(iter->zone);
1019 flags = fib_flag_trans(fa->fa_type, mask, fi);
1020 if (fi)
1021 seq_printf(seq,
1022 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1023 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1024 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1025 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1026 fi->fib_window,
1027 fi->fib_rtt >> 3, &len);
1028 else
1029 seq_printf(seq,
1030 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1031 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1032
1033 seq_printf(seq, "%*s\n", 127 - len, "");
1034out:
1035 return 0;
1036}
1037
1038static const struct seq_operations fib_seq_ops = {
1039 .start = fib_seq_start,
1040 .next = fib_seq_next,
1041 .stop = fib_seq_stop,
1042 .show = fib_seq_show,
1043};
1044
1045static int fib_seq_open(struct inode *inode, struct file *file)
1046{
1047 return seq_open_net(inode, file, &fib_seq_ops,
1048 sizeof(struct fib_iter_state));
1049}
1050
1051static const struct file_operations fib_seq_fops = {
1052 .owner = THIS_MODULE,
1053 .open = fib_seq_open,
1054 .read = seq_read,
1055 .llseek = seq_lseek,
1056 .release = seq_release_net,
1057};
1058
1059int __net_init fib_proc_init(struct net *net)
1060{
1061 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1062 return -ENOMEM;
1063 return 0;
1064}
1065
1066void __net_exit fib_proc_exit(struct net *net)
1067{
1068 proc_net_remove(net, "route");
1069}
1070#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..af0f14aba169 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -10,24 +10,25 @@ struct fib_alias {
10 struct fib_info *fa_info; 10 struct fib_info *fa_info;
11 u8 fa_tos; 11 u8 fa_tos;
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state; 13 u8 fa_state;
15#ifdef CONFIG_IP_FIB_TRIE
16 struct rcu_head rcu; 14 struct rcu_head rcu;
17#endif
18}; 15};
19 16
20#define FA_S_ACCESSED 0x01 17#define FA_S_ACCESSED 0x01
21 18
19/* Dont write on fa_state unless needed, to keep it shared on all cpus */
20static inline void fib_alias_accessed(struct fib_alias *fa)
21{
22 if (!(fa->fa_state & FA_S_ACCESSED))
23 fa->fa_state |= FA_S_ACCESSED;
24}
25
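fib_alias_accessed() above only writes fa_state when the bit is not already set, so the common hot path never dirties a cache line that every CPU reads. The same guard in a trivial standalone form (assume many readers call mark_accessed() on a read-mostly object):

    #include <stdio.h>

    struct alias { unsigned char state; };

    #define ACCESSED 0x01

    /* Write only when needed so read-mostly callers keep the line shared. */
    static void mark_accessed(struct alias *a)
    {
            if (!(a->state & ACCESSED))
                    a->state |= ACCESSED;
    }

    int main(void)
    {
            struct alias a = { 0 };

            mark_accessed(&a);   /* first call writes          */
            mark_accessed(&a);   /* later calls are read-only  */
            printf("state=%u\n", a.state);
            return 0;
    }
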
22/* Exported by fib_semantics.c */ 26/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp,
25 struct fib_result *res, int prefixlen);
26extern void fib_release_info(struct fib_info *); 27extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 28extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 29extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
29extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 30extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
30 u32 tb_id, u8 type, u8 scope, __be32 dst, 31 u32 tb_id, u8 type, __be32 dst,
31 int dst_len, u8 tos, struct fib_info *fi, 32 int dst_len, u8 tos, struct fib_info *fi,
32 unsigned int); 33 unsigned int);
33extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 34extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
@@ -42,11 +43,15 @@ extern int fib_detect_death(struct fib_info *fi, int order,
42static inline void fib_result_assign(struct fib_result *res, 43static inline void fib_result_assign(struct fib_result *res,
43 struct fib_info *fi) 44 struct fib_info *fi)
44{ 45{
45 if (res->fi != NULL) 46 /* we used to play games with refcounts, but we now use RCU */
46 fib_info_put(res->fi);
47 res->fi = fi; 47 res->fi = fi;
48 if (fi != NULL)
49 atomic_inc(&fi->fib_clntref);
50} 48}
51 49
50struct fib_prop {
51 int error;
52 u8 scope;
53};
54
55extern const struct fib_prop fib_props[RTN_MAX + 1];
56
52#endif /* _FIB_LOOKUP_H */ 57#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..a53bb1b5b118 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
6 * IPv4 Forwarding Information Base: policy rules. 6 * IPv4 Forwarding Information Base: policy rules.
7 * 7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Thomas Graf <tgraf@suug.ch> 9 * Thomas Graf <tgraf@suug.ch>
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 * 15 *
16 * Fixes: 16 * Fixes:
17 * Rani Assaf : local_rule cannot be deleted 17 * Rani Assaf : local_rule cannot be deleted
18 * Marc Boucher : routing by fwmark 18 * Marc Boucher : routing by fwmark
19 */ 19 */
20 20
@@ -32,8 +32,7 @@
32#include <net/ip_fib.h> 32#include <net/ip_fib.h>
33#include <net/fib_rules.h> 33#include <net/fib_rules.h>
34 34
35struct fib4_rule 35struct fib4_rule {
36{
37 struct fib_rule common; 36 struct fib_rule common;
38 u8 dst_len; 37 u8 dst_len;
39 u8 src_len; 38 u8 src_len;
@@ -42,26 +41,27 @@ struct fib4_rule
42 __be32 srcmask; 41 __be32 srcmask;
43 __be32 dst; 42 __be32 dst;
44 __be32 dstmask; 43 __be32 dstmask;
45#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
46 u32 tclassid; 45 u32 tclassid;
47#endif 46#endif
48}; 47};
49 48
50#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
51u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(const struct fib_result *res)
52{ 51{
53 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
54} 53}
55#endif 54#endif
56 55
57int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) 56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
58{ 57{
59 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
60 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
61 }; 61 };
62 int err; 62 int err;
63 63
64 err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
65 res->r = arg.rule; 65 res->r = arg.rule;
66 66
67 return err; 67 return err;
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
91 goto errout; 91 goto errout;
92 } 92 }
93 93
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 tbl = fib_get_table(rule->fr_net, rule->table);
95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
@@ -105,14 +106,15 @@ errout:
105static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) 106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
106{ 107{
107 struct fib4_rule *r = (struct fib4_rule *) rule; 108 struct fib4_rule *r = (struct fib4_rule *) rule;
108 __be32 daddr = fl->fl4_dst; 109 struct flowi4 *fl4 = &fl->u.ip4;
109 __be32 saddr = fl->fl4_src; 110 __be32 daddr = fl4->daddr;
111 __be32 saddr = fl4->saddr;
110 112
111 if (((saddr ^ r->src) & r->srcmask) || 113 if (((saddr ^ r->src) & r->srcmask) ||
112 ((daddr ^ r->dst) & r->dstmask)) 114 ((daddr ^ r->dst) & r->dstmask))
113 return 0; 115 return 0;
114 116
115 if (r->tos && (r->tos != fl->fl4_tos)) 117 if (r->tos && (r->tos != fl4->flowi4_tos))
116 return 0; 118 return 0;
117 119
118 return 1; 120 return 1;
@@ -164,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
164 if (frh->dst_len) 166 if (frh->dst_len)
165 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
166 168
167#ifdef CONFIG_NET_CLS_ROUTE 169#ifdef CONFIG_IP_ROUTE_CLASSID
168 if (tb[FRA_FLOW]) 170 if (tb[FRA_FLOW])
169 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
170#endif 172#endif
@@ -194,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
194 if (frh->tos && (rule4->tos != frh->tos)) 196 if (frh->tos && (rule4->tos != frh->tos))
195 return 0; 197 return 0;
196 198
197#ifdef CONFIG_NET_CLS_ROUTE 199#ifdef CONFIG_IP_ROUTE_CLASSID
198 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 200 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
199 return 0; 201 return 0;
200#endif 202#endif
@@ -223,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
223 if (rule4->src_len) 225 if (rule4->src_len)
224 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
225 227
226#ifdef CONFIG_NET_CLS_ROUTE 228#ifdef CONFIG_IP_ROUTE_CLASSID
227 if (rule4->tclassid) 229 if (rule4->tclassid)
228 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
229#endif 231#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..33e2c35b74b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
49static DEFINE_SPINLOCK(fib_info_lock); 49static DEFINE_SPINLOCK(fib_info_lock);
50static struct hlist_head *fib_info_hash; 50static struct hlist_head *fib_info_hash;
51static struct hlist_head *fib_info_laddrhash; 51static struct hlist_head *fib_info_laddrhash;
52static unsigned int fib_hash_size; 52static unsigned int fib_info_hash_size;
53static unsigned int fib_info_cnt; 53static unsigned int fib_info_cnt;
54 54
55#define DEVINDEX_HASHBITS 8 55#define DEVINDEX_HASHBITS 8
@@ -60,89 +60,93 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 60
61static DEFINE_SPINLOCK(fib_multipath_lock); 61static DEFINE_SPINLOCK(fib_multipath_lock);
62 62
63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { \
64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 int nhsel; const struct fib_nh *nh; \
65 for (nhsel = 0, nh = (fi)->fib_nh; \
66 nhsel < (fi)->fib_nhs; \
67 nh++, nhsel++)
65 68
66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ 69#define change_nexthops(fi) { \
67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) 70 int nhsel; struct fib_nh *nexthop_nh; \
71 for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
72 nhsel < (fi)->fib_nhs; \
73 nexthop_nh++, nhsel++)
68 74
69#else /* CONFIG_IP_ROUTE_MULTIPATH */ 75#else /* CONFIG_IP_ROUTE_MULTIPATH */
70 76
71/* Hope, that gcc will optimize it to get rid of dummy loop */ 77/* Hope, that gcc will optimize it to get rid of dummy loop */
72 78
73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 79#define for_nexthops(fi) { \
74for (nhsel=0; nhsel < 1; nhsel++) 80 int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
81 for (nhsel = 0; nhsel < 1; nhsel++)
75 82
76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ 83#define change_nexthops(fi) { \
77for (nhsel=0; nhsel < 1; nhsel++) 84 int nhsel; \
85 struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
86 for (nhsel = 0; nhsel < 1; nhsel++)
78 87
79#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 88#endif /* CONFIG_IP_ROUTE_MULTIPATH */
80 89
81#define endfor_nexthops(fi) } 90#define endfor_nexthops(fi) }
82 91
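The for_nexthops()/endfor_nexthops() pair being re-indented above is an open/close macro idiom: the first macro opens a scope and a loop, the second supplies the closing brace. A tiny generic version just to show the shape (not the kernel macros themselves):

    #include <stdio.h>

    #define for_items(arr, count) {                             \
            int idx;                                            \
            for (idx = 0; idx < (count); idx++)

    #define endfor_items }

    int main(void)
    {
            int vals[] = { 3, 5, 8 };

            for_items(vals, 3) {
                    printf("item %d = %d\n", idx, vals[idx]);
            } endfor_items
            return 0;
    }
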
83 92
84static const struct 93const struct fib_prop fib_props[RTN_MAX + 1] = {
85{ 94 [RTN_UNSPEC] = {
86 int error;
87 u8 scope;
88} fib_props[RTN_MAX + 1] = {
89 {
90 .error = 0, 95 .error = 0,
91 .scope = RT_SCOPE_NOWHERE, 96 .scope = RT_SCOPE_NOWHERE,
92 }, /* RTN_UNSPEC */ 97 },
93 { 98 [RTN_UNICAST] = {
94 .error = 0, 99 .error = 0,
95 .scope = RT_SCOPE_UNIVERSE, 100 .scope = RT_SCOPE_UNIVERSE,
96 }, /* RTN_UNICAST */ 101 },
97 { 102 [RTN_LOCAL] = {
98 .error = 0, 103 .error = 0,
99 .scope = RT_SCOPE_HOST, 104 .scope = RT_SCOPE_HOST,
100 }, /* RTN_LOCAL */ 105 },
101 { 106 [RTN_BROADCAST] = {
102 .error = 0, 107 .error = 0,
103 .scope = RT_SCOPE_LINK, 108 .scope = RT_SCOPE_LINK,
104 }, /* RTN_BROADCAST */ 109 },
105 { 110 [RTN_ANYCAST] = {
106 .error = 0, 111 .error = 0,
107 .scope = RT_SCOPE_LINK, 112 .scope = RT_SCOPE_LINK,
108 }, /* RTN_ANYCAST */ 113 },
109 { 114 [RTN_MULTICAST] = {
110 .error = 0, 115 .error = 0,
111 .scope = RT_SCOPE_UNIVERSE, 116 .scope = RT_SCOPE_UNIVERSE,
112 }, /* RTN_MULTICAST */ 117 },
113 { 118 [RTN_BLACKHOLE] = {
114 .error = -EINVAL, 119 .error = -EINVAL,
115 .scope = RT_SCOPE_UNIVERSE, 120 .scope = RT_SCOPE_UNIVERSE,
116 }, /* RTN_BLACKHOLE */ 121 },
117 { 122 [RTN_UNREACHABLE] = {
118 .error = -EHOSTUNREACH, 123 .error = -EHOSTUNREACH,
119 .scope = RT_SCOPE_UNIVERSE, 124 .scope = RT_SCOPE_UNIVERSE,
120 }, /* RTN_UNREACHABLE */ 125 },
121 { 126 [RTN_PROHIBIT] = {
122 .error = -EACCES, 127 .error = -EACCES,
123 .scope = RT_SCOPE_UNIVERSE, 128 .scope = RT_SCOPE_UNIVERSE,
124 }, /* RTN_PROHIBIT */ 129 },
125 { 130 [RTN_THROW] = {
126 .error = -EAGAIN, 131 .error = -EAGAIN,
127 .scope = RT_SCOPE_UNIVERSE, 132 .scope = RT_SCOPE_UNIVERSE,
128 }, /* RTN_THROW */ 133 },
129 { 134 [RTN_NAT] = {
130 .error = -EINVAL, 135 .error = -EINVAL,
131 .scope = RT_SCOPE_NOWHERE, 136 .scope = RT_SCOPE_NOWHERE,
132 }, /* RTN_NAT */ 137 },
133 { 138 [RTN_XRESOLVE] = {
134 .error = -EINVAL, 139 .error = -EINVAL,
135 .scope = RT_SCOPE_NOWHERE, 140 .scope = RT_SCOPE_NOWHERE,
136 }, /* RTN_XRESOLVE */ 141 },
137}; 142};
138 143
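Switching fib_props to designated initializers, as above, ties each slot to its RTN_* index explicitly instead of relying on positional comments. A compact userspace illustration of the same construct (the enum and table contents here are illustrative, not the kernel's):

    #include <stdio.h>

    enum rtype { T_UNSPEC, T_UNICAST, T_BLACKHOLE, T_MAX };

    struct prop { int error; const char *scope; };

    static const struct prop props[T_MAX] = {
            [T_UNSPEC]    = { .error = 0,   .scope = "nowhere"  },
            [T_UNICAST]   = { .error = 0,   .scope = "universe" },
            [T_BLACKHOLE] = { .error = -22, .scope = "universe" },
    };

    int main(void)
    {
            printf("blackhole: error=%d scope=%s\n",
                   props[T_BLACKHOLE].error, props[T_BLACKHOLE].scope);
            return 0;
    }
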
139
140/* Release a nexthop info record */ 144/* Release a nexthop info record */
141 145
142void free_fib_info(struct fib_info *fi) 146void free_fib_info(struct fib_info *fi)
143{ 147{
144 if (fi->fib_dead == 0) { 148 if (fi->fib_dead == 0) {
145 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 149 pr_warning("Freeing alive fib_info %p\n", fi);
146 return; 150 return;
147 } 151 }
148 change_nexthops(fi) { 152 change_nexthops(fi) {
@@ -152,7 +156,7 @@ void free_fib_info(struct fib_info *fi)
152 } endfor_nexthops(fi); 156 } endfor_nexthops(fi);
153 fib_info_cnt--; 157 fib_info_cnt--;
154 release_net(fi->fib_net); 158 release_net(fi->fib_net);
155 kfree(fi); 159 kfree_rcu(fi, rcu);
156} 160}
157 161
158void fib_release_info(struct fib_info *fi) 162void fib_release_info(struct fib_info *fi)
@@ -173,7 +177,7 @@ void fib_release_info(struct fib_info *fi)
173 spin_unlock_bh(&fib_info_lock); 177 spin_unlock_bh(&fib_info_lock);
174} 178}
175 179
176static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 180static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177{ 181{
178 const struct fib_nh *onh = ofi->fib_nh; 182 const struct fib_nh *onh = ofi->fib_nh;
179 183
@@ -184,10 +188,10 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
184#ifdef CONFIG_IP_ROUTE_MULTIPATH 188#ifdef CONFIG_IP_ROUTE_MULTIPATH
185 nh->nh_weight != onh->nh_weight || 189 nh->nh_weight != onh->nh_weight ||
186#endif 190#endif
187#ifdef CONFIG_NET_CLS_ROUTE 191#ifdef CONFIG_IP_ROUTE_CLASSID
188 nh->nh_tclassid != onh->nh_tclassid || 192 nh->nh_tclassid != onh->nh_tclassid ||
189#endif 193#endif
190 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 194 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
191 return -1; 195 return -1;
192 onh++; 196 onh++;
193 } endfor_nexthops(fi); 197 } endfor_nexthops(fi);
@@ -205,10 +209,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
205 209
206static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 210static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
207{ 211{
208 unsigned int mask = (fib_hash_size - 1); 212 unsigned int mask = (fib_info_hash_size - 1);
209 unsigned int val = fi->fib_nhs; 213 unsigned int val = fi->fib_nhs;
210 214
211 val ^= fi->fib_protocol; 215 val ^= (fi->fib_protocol << 8) | fi->fib_scope;
212 val ^= (__force u32)fi->fib_prefsrc; 216 val ^= (__force u32)fi->fib_prefsrc;
213 val ^= fi->fib_priority; 217 val ^= fi->fib_priority;
214 for_nexthops(fi) { 218 for_nexthops(fi) {
@@ -234,11 +238,12 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
234 if (fi->fib_nhs != nfi->fib_nhs) 238 if (fi->fib_nhs != nfi->fib_nhs)
235 continue; 239 continue;
236 if (nfi->fib_protocol == fi->fib_protocol && 240 if (nfi->fib_protocol == fi->fib_protocol &&
241 nfi->fib_scope == fi->fib_scope &&
237 nfi->fib_prefsrc == fi->fib_prefsrc && 242 nfi->fib_prefsrc == fi->fib_prefsrc &&
238 nfi->fib_priority == fi->fib_priority && 243 nfi->fib_priority == fi->fib_priority &&
239 memcmp(nfi->fib_metrics, fi->fib_metrics, 244 memcmp(nfi->fib_metrics, fi->fib_metrics,
240 sizeof(fi->fib_metrics)) == 0 && 245 sizeof(u32) * RTAX_MAX) == 0 &&
241 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 246 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
242 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 247 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
243 return fi; 248 return fi;
244 } 249 }
@@ -247,9 +252,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
247} 252}
248 253
249/* Check, that the gateway is already configured. 254/* Check, that the gateway is already configured.
250 Used only by redirect accept routine. 255 * Used only by redirect accept routine.
251 */ 256 */
252
253int ip_fib_check_default(__be32 gw, struct net_device *dev) 257int ip_fib_check_default(__be32 gw, struct net_device *dev)
254{ 258{
255 struct hlist_head *head; 259 struct hlist_head *head;
@@ -264,7 +268,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
264 hlist_for_each_entry(nh, node, head, nh_hash) { 268 hlist_for_each_entry(nh, node, head, nh_hash) {
265 if (nh->nh_dev == dev && 269 if (nh->nh_dev == dev &&
266 nh->nh_gw == gw && 270 nh->nh_gw == gw &&
267 !(nh->nh_flags&RTNH_F_DEAD)) { 271 !(nh->nh_flags & RTNH_F_DEAD)) {
268 spin_unlock(&fib_info_lock); 272 spin_unlock(&fib_info_lock);
269 return 0; 273 return 0;
270 } 274 }
@@ -315,7 +319,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
315 goto errout; 319 goto errout;
316 320
317 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 321 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
318 fa->fa_type, fa->fa_scope, key, dst_len, 322 fa->fa_type, key, dst_len,
319 fa->fa_tos, fa->fa_info, nlm_flags); 323 fa->fa_tos, fa->fa_info, nlm_flags);
320 if (err < 0) { 324 if (err < 0) {
321 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 325 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -362,10 +366,10 @@ int fib_detect_death(struct fib_info *fi, int order,
362 } 366 }
363 if (state == NUD_REACHABLE) 367 if (state == NUD_REACHABLE)
364 return 0; 368 return 0;
365 if ((state&NUD_VALID) && order != dflt) 369 if ((state & NUD_VALID) && order != dflt)
366 return 0; 370 return 0;
367 if ((state&NUD_VALID) || 371 if ((state & NUD_VALID) ||
368 (*last_idx<0 && order > dflt)) { 372 (*last_idx < 0 && order > dflt)) {
369 *last_resort = fi; 373 *last_resort = fi;
370 *last_idx = order; 374 *last_idx = order;
371 } 375 }
@@ -407,7 +411,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
407 411
408 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 412 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
409 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 413 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
410#ifdef CONFIG_NET_CLS_ROUTE 414#ifdef CONFIG_IP_ROUTE_CLASSID
411 nla = nla_find(attrs, attrlen, RTA_FLOW); 415 nla = nla_find(attrs, attrlen, RTA_FLOW);
412 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 416 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
413#endif 417#endif
@@ -461,7 +465,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
461 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 465 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
462 if (nla && nla_get_be32(nla) != nh->nh_gw) 466 if (nla && nla_get_be32(nla) != nh->nh_gw)
463 return 1; 467 return 1;
464#ifdef CONFIG_NET_CLS_ROUTE 468#ifdef CONFIG_IP_ROUTE_CLASSID
465 nla = nla_find(attrs, attrlen, RTA_FLOW); 469 nla = nla_find(attrs, attrlen, RTA_FLOW);
466 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 470 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
467 return 1; 471 return 1;
@@ -476,145 +480,146 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 480
477 481
478/* 482/*
479 Picture 483 * Picture
480 ------- 484 * -------
481 485 *
482 Semantics of nexthop is very messy by historical reasons. 486 * Semantics of nexthop is very messy by historical reasons.
483 We have to take into account, that: 487 * We have to take into account, that:
484 a) gateway can be actually local interface address, 488 * a) gateway can be actually local interface address,
485 so that gatewayed route is direct. 489 * so that gatewayed route is direct.
486 b) gateway must be on-link address, possibly 490 * b) gateway must be on-link address, possibly
487 described not by an ifaddr, but also by a direct route. 491 * described not by an ifaddr, but also by a direct route.
488 c) If both gateway and interface are specified, they should not 492 * c) If both gateway and interface are specified, they should not
489 contradict. 493 * contradict.
490 d) If we use tunnel routes, gateway could be not on-link. 494 * d) If we use tunnel routes, gateway could be not on-link.
491 495 *
492 Attempt to reconcile all of these (alas, self-contradictory) conditions 496 * Attempt to reconcile all of these (alas, self-contradictory) conditions
493 results in pretty ugly and hairy code with obscure logic. 497 * results in pretty ugly and hairy code with obscure logic.
494 498 *
 495 I chose to generalize it instead, so that the size 499 * I chose to generalize it instead, so that the size
496 of code does not increase practically, but it becomes 500 * of code does not increase practically, but it becomes
497 much more general. 501 * much more general.
498 Every prefix is assigned a "scope" value: "host" is local address, 502 * Every prefix is assigned a "scope" value: "host" is local address,
499 "link" is direct route, 503 * "link" is direct route,
500 [ ... "site" ... "interior" ... ] 504 * [ ... "site" ... "interior" ... ]
501 and "universe" is true gateway route with global meaning. 505 * and "universe" is true gateway route with global meaning.
502 506 *
503 Every prefix refers to a set of "nexthop"s (gw, oif), 507 * Every prefix refers to a set of "nexthop"s (gw, oif),
504 where gw must have narrower scope. This recursion stops 508 * where gw must have narrower scope. This recursion stops
505 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 509 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
506 which means that gw is forced to be on link. 510 * which means that gw is forced to be on link.
507 511 *
508 Code is still hairy, but now it is apparently logically 512 * Code is still hairy, but now it is apparently logically
509 consistent and very flexible. F.e. as by-product it allows 513 * consistent and very flexible. F.e. as by-product it allows
 510 to co-exist in peace independent exterior and interior 514 * to co-exist in peace independent exterior and interior
511 routing processes. 515 * routing processes.
512 516 *
 513 Normally it looks as follows. 517 * Normally it looks as follows.
514 518 *
515 {universe prefix} -> (gw, oif) [scope link] 519 * {universe prefix} -> (gw, oif) [scope link]
516 | 520 * |
517 |-> {link prefix} -> (gw, oif) [scope local] 521 * |-> {link prefix} -> (gw, oif) [scope local]
518 | 522 * |
519 |-> {local prefix} (terminal node) 523 * |-> {local prefix} (terminal node)
520 */ 524 */
521
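The scope rule described in the comment above is easier to see in a toy resolver: every prefix carries a scope, a gateway must itself resolve through a prefix of strictly narrower scope, and the walk terminates at a host-scope (local) entry. The sketch below is a userspace-only illustration with invented names; the kernel performs the equivalent check one level at a time in fib_check_nh() via fib_lookup(), not recursively.

```c
/*
 * Toy illustration (not kernel code) of the scope-narrowing rule: larger
 * scope value == narrower scope, and a gateway must resolve through a
 * route of strictly narrower scope until a host-scope entry is reached.
 */
#include <stdio.h>

enum { SCOPE_UNIVERSE = 0, SCOPE_LINK = 253, SCOPE_HOST = 254 };

struct toy_route {
    unsigned int prefix;   /* pretend prefix id */
    int scope;             /* scope of this prefix */
    int gw_prefix;         /* prefix the gateway falls under, -1 = none */
};

/* Hypothetical table: 0 = default route, 1 = on-link subnet, 2 = local addr */
static const struct toy_route table[] = {
    { 0, SCOPE_UNIVERSE, 1 },   /* default via a gateway on subnet 1 */
    { 1, SCOPE_LINK,     2 },   /* subnet is reached via a local address */
    { 2, SCOPE_HOST,    -1 },   /* local interface address, terminal */
};

static int resolve_scope(int prefix, int min_scope)
{
    const struct toy_route *r = &table[prefix];

    if (r->scope < min_scope)
        return -1;              /* would not narrow: reject, like -EINVAL */
    if (r->scope >= SCOPE_HOST || r->gw_prefix < 0)
        return r->scope;        /* terminal node, the walk stops here */
    /* the gateway must resolve with a strictly narrower scope */
    return resolve_scope(r->gw_prefix, r->scope + 1);
}

int main(void)
{
    printf("default route resolves at scope %d\n",
           resolve_scope(0, SCOPE_UNIVERSE));
    return 0;
}
```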
522static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 525static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
523 struct fib_nh *nh) 526 struct fib_nh *nh)
524{ 527{
525 int err; 528 int err;
526 struct net *net; 529 struct net *net;
530 struct net_device *dev;
527 531
528 net = cfg->fc_nlinfo.nl_net; 532 net = cfg->fc_nlinfo.nl_net;
529 if (nh->nh_gw) { 533 if (nh->nh_gw) {
530 struct fib_result res; 534 struct fib_result res;
531 535
532 if (nh->nh_flags&RTNH_F_ONLINK) { 536 if (nh->nh_flags & RTNH_F_ONLINK) {
533 struct net_device *dev;
534 537
535 if (cfg->fc_scope >= RT_SCOPE_LINK) 538 if (cfg->fc_scope >= RT_SCOPE_LINK)
536 return -EINVAL; 539 return -EINVAL;
537 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 540 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
538 return -EINVAL; 541 return -EINVAL;
539 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 542 dev = __dev_get_by_index(net, nh->nh_oif);
543 if (!dev)
540 return -ENODEV; 544 return -ENODEV;
541 if (!(dev->flags&IFF_UP)) 545 if (!(dev->flags & IFF_UP))
542 return -ENETDOWN; 546 return -ENETDOWN;
543 nh->nh_dev = dev; 547 nh->nh_dev = dev;
544 dev_hold(dev); 548 dev_hold(dev);
545 nh->nh_scope = RT_SCOPE_LINK; 549 nh->nh_scope = RT_SCOPE_LINK;
546 return 0; 550 return 0;
547 } 551 }
552 rcu_read_lock();
548 { 553 {
549 struct flowi fl = { 554 struct flowi4 fl4 = {
550 .nl_u = { 555 .daddr = nh->nh_gw,
551 .ip4_u = { 556 .flowi4_scope = cfg->fc_scope + 1,
552 .daddr = nh->nh_gw, 557 .flowi4_oif = nh->nh_oif,
553 .scope = cfg->fc_scope + 1,
554 },
555 },
556 .oif = nh->nh_oif,
557 }; 558 };
558 559
559 /* It is not necessary, but requires a bit of thinking */ 560 /* It is not necessary, but requires a bit of thinking */
560 if (fl.fl4_scope < RT_SCOPE_LINK) 561 if (fl4.flowi4_scope < RT_SCOPE_LINK)
561 fl.fl4_scope = RT_SCOPE_LINK; 562 fl4.flowi4_scope = RT_SCOPE_LINK;
562 if ((err = fib_lookup(net, &fl, &res)) != 0) 563 err = fib_lookup(net, &fl4, &res);
564 if (err) {
565 rcu_read_unlock();
563 return err; 566 return err;
567 }
564 } 568 }
565 err = -EINVAL; 569 err = -EINVAL;
566 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 570 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
567 goto out; 571 goto out;
568 nh->nh_scope = res.scope; 572 nh->nh_scope = res.scope;
569 nh->nh_oif = FIB_RES_OIF(res); 573 nh->nh_oif = FIB_RES_OIF(res);
570 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 574 nh->nh_dev = dev = FIB_RES_DEV(res);
575 if (!dev)
571 goto out; 576 goto out;
572 dev_hold(nh->nh_dev); 577 dev_hold(dev);
573 err = -ENETDOWN; 578 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
574 if (!(nh->nh_dev->flags & IFF_UP))
575 goto out;
576 err = 0;
577out:
578 fib_res_put(&res);
579 return err;
580 } else { 579 } else {
581 struct in_device *in_dev; 580 struct in_device *in_dev;
582 581
583 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 582 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
584 return -EINVAL; 583 return -EINVAL;
585 584
585 rcu_read_lock();
586 err = -ENODEV;
586 in_dev = inetdev_by_index(net, nh->nh_oif); 587 in_dev = inetdev_by_index(net, nh->nh_oif);
587 if (in_dev == NULL) 588 if (in_dev == NULL)
588 return -ENODEV; 589 goto out;
589 if (!(in_dev->dev->flags&IFF_UP)) { 590 err = -ENETDOWN;
590 in_dev_put(in_dev); 591 if (!(in_dev->dev->flags & IFF_UP))
591 return -ENETDOWN; 592 goto out;
592 }
593 nh->nh_dev = in_dev->dev; 593 nh->nh_dev = in_dev->dev;
594 dev_hold(nh->nh_dev); 594 dev_hold(nh->nh_dev);
595 nh->nh_scope = RT_SCOPE_HOST; 595 nh->nh_scope = RT_SCOPE_HOST;
596 in_dev_put(in_dev); 596 err = 0;
597 } 597 }
598 return 0; 598out:
599 rcu_read_unlock();
600 return err;
599} 601}
600 602
601static inline unsigned int fib_laddr_hashfn(__be32 val) 603static inline unsigned int fib_laddr_hashfn(__be32 val)
602{ 604{
603 unsigned int mask = (fib_hash_size - 1); 605 unsigned int mask = (fib_info_hash_size - 1);
604 606
605 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 607 return ((__force u32)val ^
608 ((__force u32)val >> 7) ^
609 ((__force u32)val >> 14)) & mask;
606} 610}
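fib_laddr_hashfn() folds the 32-bit address onto itself with two shifted XORs and masks with (table size - 1), which only spreads keys well when the table size is a power of two (fib_info_hash_size is always grown by doubling). A minimal userspace replica, ignoring byte order:

```c
/* Userspace replica of the xor-fold hash above, assuming a power-of-two
 * table size so that (size - 1) works as a mask. Byte order is ignored. */
#include <stdint.h>
#include <stdio.h>

static unsigned int laddr_hash(uint32_t val, unsigned int table_size)
{
    unsigned int mask = table_size - 1;   /* table_size must be 2^n */

    return (val ^ (val >> 7) ^ (val >> 14)) & mask;
}

int main(void)
{
    /* 192.0.2.1 and 192.0.2.2 in host byte order, just for illustration */
    uint32_t a = (192u << 24) | (2u << 8) | 1u;
    uint32_t b = (192u << 24) | (2u << 8) | 2u;

    printf("%u %u\n", laddr_hash(a, 256), laddr_hash(b, 256));
    return 0;
}
```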
607 611
608static struct hlist_head *fib_hash_alloc(int bytes) 612static struct hlist_head *fib_info_hash_alloc(int bytes)
609{ 613{
610 if (bytes <= PAGE_SIZE) 614 if (bytes <= PAGE_SIZE)
611 return kzalloc(bytes, GFP_KERNEL); 615 return kzalloc(bytes, GFP_KERNEL);
612 else 616 else
613 return (struct hlist_head *) 617 return (struct hlist_head *)
614 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 618 __get_free_pages(GFP_KERNEL | __GFP_ZERO,
619 get_order(bytes));
615} 620}
616 621
617static void fib_hash_free(struct hlist_head *hash, int bytes) 622static void fib_info_hash_free(struct hlist_head *hash, int bytes)
618{ 623{
619 if (!hash) 624 if (!hash)
620 return; 625 return;
@@ -625,18 +630,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
625 free_pages((unsigned long) hash, get_order(bytes)); 630 free_pages((unsigned long) hash, get_order(bytes));
626} 631}
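fib_info_hash_alloc()/fib_info_hash_free() switch from the slab to whole zeroed pages once the table no longer fits in a single page. A rough userspace analogue of that size-threshold strategy (calloc below one page, anonymous mmap above; this is not the kernel API):

```c
/* Userspace analogue (not kernel code): small tables from the heap,
 * larger ones as whole zeroed pages, mirroring the kzalloc /
 * __get_free_pages split above. */
#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static void *table_alloc(size_t bytes)
{
    if (bytes <= (size_t)sysconf(_SC_PAGESIZE))
        return calloc(1, bytes);

    void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return p == MAP_FAILED ? NULL : p;   /* already zero-filled */
}

static void table_free(void *p, size_t bytes)
{
    if (!p)
        return;
    if (bytes <= (size_t)sysconf(_SC_PAGESIZE))
        free(p);
    else
        munmap(p, bytes);
}

int main(void)
{
    void *small = table_alloc(64);
    void *big   = table_alloc(16 * 4096);

    table_free(small, 64);
    table_free(big, 16 * 4096);
    return 0;
}
```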
627 632
628static void fib_hash_move(struct hlist_head *new_info_hash, 633static void fib_info_hash_move(struct hlist_head *new_info_hash,
629 struct hlist_head *new_laddrhash, 634 struct hlist_head *new_laddrhash,
630 unsigned int new_size) 635 unsigned int new_size)
631{ 636{
632 struct hlist_head *old_info_hash, *old_laddrhash; 637 struct hlist_head *old_info_hash, *old_laddrhash;
633 unsigned int old_size = fib_hash_size; 638 unsigned int old_size = fib_info_hash_size;
634 unsigned int i, bytes; 639 unsigned int i, bytes;
635 640
636 spin_lock_bh(&fib_info_lock); 641 spin_lock_bh(&fib_info_lock);
637 old_info_hash = fib_info_hash; 642 old_info_hash = fib_info_hash;
638 old_laddrhash = fib_info_laddrhash; 643 old_laddrhash = fib_info_laddrhash;
639 fib_hash_size = new_size; 644 fib_info_hash_size = new_size;
640 645
641 for (i = 0; i < old_size; i++) { 646 for (i = 0; i < old_size; i++) {
642 struct hlist_head *head = &fib_info_hash[i]; 647 struct hlist_head *head = &fib_info_hash[i];
@@ -677,8 +682,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
677 spin_unlock_bh(&fib_info_lock); 682 spin_unlock_bh(&fib_info_lock);
678 683
679 bytes = old_size * sizeof(struct hlist_head *); 684 bytes = old_size * sizeof(struct hlist_head *);
680 fib_hash_free(old_info_hash, bytes); 685 fib_info_hash_free(old_info_hash, bytes);
681 fib_hash_free(old_laddrhash, bytes); 686 fib_info_hash_free(old_laddrhash, bytes);
687}
688
689__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
690{
691 nh->nh_saddr = inet_select_addr(nh->nh_dev,
692 nh->nh_gw,
693 nh->nh_parent->fib_scope);
694 nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
695
696 return nh->nh_saddr;
682} 697}
683 698
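The new fib_info_update_nh_saddr() caches the preferred source address together with a generation number taken from the per-netns dev_addr_genid counter, so the cached value only has to be recomputed after an address change. A sketch of that generation-counter pattern, with made-up names and no locking:

```c
/* Sketch of the generation-counter caching idea: recompute the cached
 * value only when the global "address configuration changed" counter has
 * moved. All names here are invented for illustration. */
#include <stdio.h>

static unsigned int dev_addr_genid;       /* bumped on every address change */

struct cached_saddr {
    unsigned int value;
    unsigned int genid;
};

static unsigned int pick_source_address(void)
{
    return 0xc0000201u;                   /* pretend inet_select_addr() result */
}

static unsigned int get_saddr(struct cached_saddr *c)
{
    if (c->genid != dev_addr_genid) {     /* stale: addresses changed */
        c->value = pick_source_address();
        c->genid = dev_addr_genid;
    }
    return c->value;
}

int main(void)
{
    struct cached_saddr c = { 0, (unsigned int)-1 };

    printf("%#x\n", get_saddr(&c));       /* recomputed */
    dev_addr_genid++;                     /* e.g. an address was added */
    printf("%#x\n", get_saddr(&c));       /* recomputed again */
    return 0;
}
```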
684struct fib_info *fib_create_info(struct fib_config *cfg) 699struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -689,6 +704,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
689 int nhs = 1; 704 int nhs = 1;
690 struct net *net = cfg->fc_nlinfo.nl_net; 705 struct net *net = cfg->fc_nlinfo.nl_net;
691 706
707 if (cfg->fc_type > RTN_MAX)
708 goto err_inval;
709
692 /* Fast check to catch the most weird cases */ 710 /* Fast check to catch the most weird cases */
693 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 711 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
694 goto err_inval; 712 goto err_inval;
@@ -702,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
702#endif 720#endif
703 721
704 err = -ENOBUFS; 722 err = -ENOBUFS;
705 if (fib_info_cnt >= fib_hash_size) { 723 if (fib_info_cnt >= fib_info_hash_size) {
706 unsigned int new_size = fib_hash_size << 1; 724 unsigned int new_size = fib_info_hash_size << 1;
707 struct hlist_head *new_info_hash; 725 struct hlist_head *new_info_hash;
708 struct hlist_head *new_laddrhash; 726 struct hlist_head *new_laddrhash;
709 unsigned int bytes; 727 unsigned int bytes;
@@ -711,25 +729,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
711 if (!new_size) 729 if (!new_size)
712 new_size = 1; 730 new_size = 1;
713 bytes = new_size * sizeof(struct hlist_head *); 731 bytes = new_size * sizeof(struct hlist_head *);
714 new_info_hash = fib_hash_alloc(bytes); 732 new_info_hash = fib_info_hash_alloc(bytes);
715 new_laddrhash = fib_hash_alloc(bytes); 733 new_laddrhash = fib_info_hash_alloc(bytes);
716 if (!new_info_hash || !new_laddrhash) { 734 if (!new_info_hash || !new_laddrhash) {
717 fib_hash_free(new_info_hash, bytes); 735 fib_info_hash_free(new_info_hash, bytes);
718 fib_hash_free(new_laddrhash, bytes); 736 fib_info_hash_free(new_laddrhash, bytes);
719 } else 737 } else
720 fib_hash_move(new_info_hash, new_laddrhash, new_size); 738 fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
721 739
722 if (!fib_hash_size) 740 if (!fib_info_hash_size)
723 goto failure; 741 goto failure;
724 } 742 }
725 743
726 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 744 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
727 if (fi == NULL) 745 if (fi == NULL)
728 goto failure; 746 goto failure;
747 if (cfg->fc_mx) {
748 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
749 if (!fi->fib_metrics)
750 goto failure;
751 } else
752 fi->fib_metrics = (u32 *) dst_default_metrics;
729 fib_info_cnt++; 753 fib_info_cnt++;
730 754
731 fi->fib_net = hold_net(net); 755 fi->fib_net = hold_net(net);
732 fi->fib_protocol = cfg->fc_protocol; 756 fi->fib_protocol = cfg->fc_protocol;
757 fi->fib_scope = cfg->fc_scope;
733 fi->fib_flags = cfg->fc_flags; 758 fi->fib_flags = cfg->fc_flags;
734 fi->fib_priority = cfg->fc_priority; 759 fi->fib_priority = cfg->fc_priority;
735 fi->fib_prefsrc = cfg->fc_prefsrc; 760 fi->fib_prefsrc = cfg->fc_prefsrc;
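fib_create_info() now allocates a private RTAX_MAX metrics array only when the route actually carries metrics (cfg->fc_mx); everything else points at the shared, read-only dst_default_metrics. A simplified sketch of that sharing scheme (TOY_RTAX_MAX and the flag are stand-ins, not the kernel names):

```c
/* Sketch of the metrics-sharing scheme: routes with no explicit metrics
 * point at one shared, read-only default array instead of each carrying
 * their own copy. */
#include <stdint.h>
#include <stdlib.h>

#define TOY_RTAX_MAX 16

static const uint32_t default_metrics[TOY_RTAX_MAX];   /* all zeroes, shared */

struct toy_route {
    uint32_t *metrics;
};

static int route_init(struct toy_route *rt, int has_explicit_metrics)
{
    if (has_explicit_metrics) {
        rt->metrics = calloc(TOY_RTAX_MAX, sizeof(uint32_t));
        if (!rt->metrics)
            return -1;
    } else {
        /* cast away const: the shared array is never written through this */
        rt->metrics = (uint32_t *)default_metrics;
    }
    return 0;
}

static void route_release(struct toy_route *rt)
{
    if (rt->metrics != (uint32_t *)default_metrics)
        free(rt->metrics);
}

int main(void)
{
    struct toy_route plain, tuned;

    route_init(&plain, 0);    /* shares default_metrics */
    route_init(&tuned, 1);    /* gets its own writable array */
    tuned.metrics[1] = 1500;  /* e.g. a per-route metric */
    route_release(&tuned);
    route_release(&plain);
    return 0;
}
```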
@@ -763,7 +788,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
763 goto err_inval; 788 goto err_inval;
764 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 789 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
765 goto err_inval; 790 goto err_inval;
766#ifdef CONFIG_NET_CLS_ROUTE 791#ifdef CONFIG_IP_ROUTE_CLASSID
767 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 792 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
768 goto err_inval; 793 goto err_inval;
769#endif 794#endif
@@ -776,7 +801,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
776 nh->nh_oif = cfg->fc_oif; 801 nh->nh_oif = cfg->fc_oif;
777 nh->nh_gw = cfg->fc_gw; 802 nh->nh_gw = cfg->fc_gw;
778 nh->nh_flags = cfg->fc_flags; 803 nh->nh_flags = cfg->fc_flags;
779#ifdef CONFIG_NET_CLS_ROUTE 804#ifdef CONFIG_IP_ROUTE_CLASSID
780 nh->nh_tclassid = cfg->fc_flow; 805 nh->nh_tclassid = cfg->fc_flow;
781#endif 806#endif
782#ifdef CONFIG_IP_ROUTE_MULTIPATH 807#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -788,6 +813,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
788 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 813 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
789 goto err_inval; 814 goto err_inval;
790 goto link_it; 815 goto link_it;
816 } else {
817 switch (cfg->fc_type) {
818 case RTN_UNICAST:
819 case RTN_LOCAL:
820 case RTN_BROADCAST:
821 case RTN_ANYCAST:
822 case RTN_MULTICAST:
823 break;
824 default:
825 goto err_inval;
826 }
791 } 827 }
792 828
793 if (cfg->fc_scope > RT_SCOPE_HOST) 829 if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -806,7 +842,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
806 goto failure; 842 goto failure;
807 } else { 843 } else {
808 change_nexthops(fi) { 844 change_nexthops(fi) {
809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) 845 err = fib_check_nh(cfg, fi, nexthop_nh);
846 if (err != 0)
810 goto failure; 847 goto failure;
811 } endfor_nexthops(fi) 848 } endfor_nexthops(fi)
812 } 849 }
@@ -818,8 +855,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
818 goto err_inval; 855 goto err_inval;
819 } 856 }
820 857
858 change_nexthops(fi) {
859 fib_info_update_nh_saddr(net, nexthop_nh);
860 } endfor_nexthops(fi)
861
821link_it: 862link_it:
822 if ((ofi = fib_find_info(fi)) != NULL) { 863 ofi = fib_find_info(fi);
864 if (ofi) {
823 fi->fib_dead = 1; 865 fi->fib_dead = 1;
824 free_fib_info(fi); 866 free_fib_info(fi);
825 ofi->fib_treeref++; 867 ofi->fib_treeref++;
@@ -862,86 +904,8 @@ failure:
862 return ERR_PTR(err); 904 return ERR_PTR(err);
863} 905}
864 906
865/* Note! fib_semantic_match intentionally uses RCU list functions. */
866int fib_semantic_match(struct list_head *head, const struct flowi *flp,
867 struct fib_result *res, int prefixlen)
868{
869 struct fib_alias *fa;
870 int nh_sel = 0;
871
872 list_for_each_entry_rcu(fa, head, fa_list) {
873 int err;
874
875 if (fa->fa_tos &&
876 fa->fa_tos != flp->fl4_tos)
877 continue;
878
879 if (fa->fa_scope < flp->fl4_scope)
880 continue;
881
882 fa->fa_state |= FA_S_ACCESSED;
883
884 err = fib_props[fa->fa_type].error;
885 if (err == 0) {
886 struct fib_info *fi = fa->fa_info;
887
888 if (fi->fib_flags & RTNH_F_DEAD)
889 continue;
890
891 switch (fa->fa_type) {
892 case RTN_UNICAST:
893 case RTN_LOCAL:
894 case RTN_BROADCAST:
895 case RTN_ANYCAST:
896 case RTN_MULTICAST:
897 for_nexthops(fi) {
898 if (nh->nh_flags&RTNH_F_DEAD)
899 continue;
900 if (!flp->oif || flp->oif == nh->nh_oif)
901 break;
902 }
903#ifdef CONFIG_IP_ROUTE_MULTIPATH
904 if (nhsel < fi->fib_nhs) {
905 nh_sel = nhsel;
906 goto out_fill_res;
907 }
908#else
909 if (nhsel < 1) {
910 goto out_fill_res;
911 }
912#endif
913 endfor_nexthops(fi);
914 continue;
915
916 default:
917 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
918 fa->fa_type);
919 return -EINVAL;
920 }
921 }
922 return err;
923 }
924 return 1;
925
926out_fill_res:
927 res->prefixlen = prefixlen;
928 res->nh_sel = nh_sel;
929 res->type = fa->fa_type;
930 res->scope = fa->fa_scope;
931 res->fi = fa->fa_info;
932 atomic_inc(&res->fi->fib_clntref);
933 return 0;
934}
935
936/* Find appropriate source address to this destination */
937
938__be32 __fib_res_prefsrc(struct fib_result *res)
939{
940 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
941}
942
943int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 907int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
944 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 908 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
945 struct fib_info *fi, unsigned int flags) 909 struct fib_info *fi, unsigned int flags)
946{ 910{
947 struct nlmsghdr *nlh; 911 struct nlmsghdr *nlh;
@@ -963,7 +927,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
963 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 927 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
964 rtm->rtm_type = type; 928 rtm->rtm_type = type;
965 rtm->rtm_flags = fi->fib_flags; 929 rtm->rtm_flags = fi->fib_flags;
966 rtm->rtm_scope = scope; 930 rtm->rtm_scope = fi->fib_scope;
967 rtm->rtm_protocol = fi->fib_protocol; 931 rtm->rtm_protocol = fi->fib_protocol;
968 932
969 if (rtm->rtm_dst_len) 933 if (rtm->rtm_dst_len)
@@ -984,7 +948,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
984 948
985 if (fi->fib_nh->nh_oif) 949 if (fi->fib_nh->nh_oif)
986 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 950 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
987#ifdef CONFIG_NET_CLS_ROUTE 951#ifdef CONFIG_IP_ROUTE_CLASSID
988 if (fi->fib_nh[0].nh_tclassid) 952 if (fi->fib_nh[0].nh_tclassid)
989 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 953 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
990#endif 954#endif
@@ -1009,7 +973,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1009 973
1010 if (nh->nh_gw) 974 if (nh->nh_gw)
1011 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 975 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1012#ifdef CONFIG_NET_CLS_ROUTE 976#ifdef CONFIG_IP_ROUTE_CLASSID
1013 if (nh->nh_tclassid) 977 if (nh->nh_tclassid)
1014 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 978 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1015#endif 979#endif
@@ -1028,10 +992,10 @@ nla_put_failure:
1028} 992}
1029 993
1030/* 994/*
1031 Update FIB if: 995 * Update FIB if:
1032 - local address disappeared -> we must delete all the entries 996 * - local address disappeared -> we must delete all the entries
1033 referring to it. 997 * referring to it.
1034 - device went down -> we must shutdown all nexthops going via it. 998 * - device went down -> we must shutdown all nexthops going via it.
1035 */ 999 */
1036int fib_sync_down_addr(struct net *net, __be32 local) 1000int fib_sync_down_addr(struct net *net, __be32 local)
1037{ 1001{
@@ -1078,7 +1042,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1078 prev_fi = fi; 1042 prev_fi = fi;
1079 dead = 0; 1043 dead = 0;
1080 change_nexthops(fi) { 1044 change_nexthops(fi) {
1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD) 1045 if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1082 dead++; 1046 dead++;
1083 else if (nexthop_nh->nh_dev == dev && 1047 else if (nexthop_nh->nh_dev == dev &&
1084 nexthop_nh->nh_scope != scope) { 1048 nexthop_nh->nh_scope != scope) {
@@ -1107,13 +1071,68 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1107 return ret; 1071 return ret;
1108} 1072}
1109 1073
1074/* Must be invoked inside of an RCU protected region. */
1075void fib_select_default(struct fib_result *res)
1076{
1077 struct fib_info *fi = NULL, *last_resort = NULL;
1078 struct list_head *fa_head = res->fa_head;
1079 struct fib_table *tb = res->table;
1080 int order = -1, last_idx = -1;
1081 struct fib_alias *fa;
1082
1083 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1084 struct fib_info *next_fi = fa->fa_info;
1085
1086 if (next_fi->fib_scope != res->scope ||
1087 fa->fa_type != RTN_UNICAST)
1088 continue;
1089
1090 if (next_fi->fib_priority > res->fi->fib_priority)
1091 break;
1092 if (!next_fi->fib_nh[0].nh_gw ||
1093 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1094 continue;
1095
1096 fib_alias_accessed(fa);
1097
1098 if (fi == NULL) {
1099 if (next_fi != res->fi)
1100 break;
1101 } else if (!fib_detect_death(fi, order, &last_resort,
1102 &last_idx, tb->tb_default)) {
1103 fib_result_assign(res, fi);
1104 tb->tb_default = order;
1105 goto out;
1106 }
1107 fi = next_fi;
1108 order++;
1109 }
1110
1111 if (order <= 0 || fi == NULL) {
1112 tb->tb_default = -1;
1113 goto out;
1114 }
1115
1116 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1117 tb->tb_default)) {
1118 fib_result_assign(res, fi);
1119 tb->tb_default = order;
1120 goto out;
1121 }
1122
1123 if (last_idx >= 0)
1124 fib_result_assign(res, last_resort);
1125 tb->tb_default = last_idx;
1126out:
1127 return;
1128}
1129
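fib_select_default() above walks the candidate default routes in priority order, prefers the first one whose gateway neighbour still looks reachable (via fib_detect_death()), and keeps a last resort to fall back on. A heavily simplified skeleton of that selection, ignoring the tb_default caching and the exact ordering rules:

```c
/* Much-simplified skeleton, purely illustrative: walk candidates in
 * order, prefer the first one whose neighbour still looks alive, and
 * remember a fallback. The kernel logic is more involved. */
struct candidate {
    int alive;          /* stand-in for the neighbour-state check */
    int usable;         /* gateway present, scope link, etc. */
};

static int select_default(const struct candidate *c, int n)
{
    int last_resort = -1;

    for (int i = 0; i < n; i++) {
        if (!c[i].usable)
            continue;
        if (c[i].alive)
            return i;            /* first healthy candidate wins */
        if (last_resort < 0)
            last_resort = i;     /* keep something to fall back on */
    }
    return last_resort;          /* may still be -1: nothing usable */
}

int main(void)
{
    const struct candidate cands[] = {
        { .alive = 0, .usable = 1 },   /* preferred but neighbour dead */
        { .alive = 1, .usable = 1 },   /* healthy second choice */
    };

    return select_default(cands, 2) == 1 ? 0 : 1;
}
```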
1110#ifdef CONFIG_IP_ROUTE_MULTIPATH 1130#ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 1131
1112/* 1132/*
1113 Dead device goes up. We wake up dead nexthops. 1133 * Dead device goes up. We wake up dead nexthops.
 1114 It makes sense only on multipath routes. 1134 * It makes sense only on multipath routes.
1115 */ 1135 */
1116
1117int fib_sync_up(struct net_device *dev) 1136int fib_sync_up(struct net_device *dev)
1118{ 1137{
1119 struct fib_info *prev_fi; 1138 struct fib_info *prev_fi;
@@ -1123,7 +1142,7 @@ int fib_sync_up(struct net_device *dev)
1123 struct fib_nh *nh; 1142 struct fib_nh *nh;
1124 int ret; 1143 int ret;
1125 1144
1126 if (!(dev->flags&IFF_UP)) 1145 if (!(dev->flags & IFF_UP))
1127 return 0; 1146 return 0;
1128 1147
1129 prev_fi = NULL; 1148 prev_fi = NULL;
@@ -1142,12 +1161,12 @@ int fib_sync_up(struct net_device *dev)
1142 prev_fi = fi; 1161 prev_fi = fi;
1143 alive = 0; 1162 alive = 0;
1144 change_nexthops(fi) { 1163 change_nexthops(fi) {
1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1164 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1146 alive++; 1165 alive++;
1147 continue; 1166 continue;
1148 } 1167 }
1149 if (nexthop_nh->nh_dev == NULL || 1168 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP)) 1169 !(nexthop_nh->nh_dev->flags & IFF_UP))
1151 continue; 1170 continue;
1152 if (nexthop_nh->nh_dev != dev || 1171 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev)) 1172 !__in_dev_get_rtnl(dev))
@@ -1169,11 +1188,10 @@ int fib_sync_up(struct net_device *dev)
1169} 1188}
1170 1189
1171/* 1190/*
1172 The algorithm is suboptimal, but it provides really 1191 * The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution. 1192 * fair weighted route distribution.
1174 */ 1193 */
1175 1194void fib_select_multipath(struct fib_result *res)
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{ 1195{
1178 struct fib_info *fi = res->fi; 1196 struct fib_info *fi = res->fi;
1179 int w; 1197 int w;
@@ -1182,7 +1200,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1200 if (fi->fib_power <= 0) {
1183 int power = 0; 1201 int power = 0;
1184 change_nexthops(fi) { 1202 change_nexthops(fi) {
1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1203 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1186 power += nexthop_nh->nh_weight; 1204 power += nexthop_nh->nh_weight;
1187 nexthop_nh->nh_power = nexthop_nh->nh_weight; 1205 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1206 }
@@ -1198,15 +1216,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1198 1216
1199 1217
1200 /* w should be random number [0..fi->fib_power-1], 1218 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation. 1219 * it is pretty bad approximation.
1202 */ 1220 */
1203 1221
1204 w = jiffies % fi->fib_power; 1222 w = jiffies % fi->fib_power;
1205 1223
1206 change_nexthops(fi) { 1224 change_nexthops(fi) {
1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && 1225 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1208 nexthop_nh->nh_power) { 1226 nexthop_nh->nh_power) {
1209 if ((w -= nexthop_nh->nh_power) <= 0) { 1227 w -= nexthop_nh->nh_power;
1228 if (w <= 0) {
1210 nexthop_nh->nh_power--; 1229 nexthop_nh->nh_power--;
1211 fi->fib_power--; 1230 fi->fib_power--;
1212 res->nh_sel = nhsel; 1231 res->nh_sel = nhsel;
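fib_select_multipath() implements weighted selection by giving each nexthop a credit ("power") refilled from its weight, drawing a pseudo-random value in [0, total power) and charging it against the nexthops in order. A standalone version that can be compiled and run to check the resulting split (rand() stands in for jiffies, all paths assumed alive):

```c
/* Standalone weighted nexthop selection in the style of the code above.
 * Error handling omitted; names are not the kernel's. */
#include <stdio.h>
#include <stdlib.h>

struct path {
    int weight;
    int power;
};

static int select_path(struct path *p, int n, int *total_power)
{
    if (*total_power <= 0) {              /* refill credits from weights */
        *total_power = 0;
        for (int i = 0; i < n; i++) {
            p[i].power = p[i].weight;
            *total_power += p[i].weight;
        }
    }

    int w = rand() % *total_power;        /* kernel uses jiffies here */
    for (int i = 0; i < n; i++) {
        if (p[i].power <= 0)
            continue;
        w -= p[i].power;
        if (w <= 0) {
            p[i].power--;
            (*total_power)--;
            return i;
        }
    }
    return 0;                             /* not reached with sane input */
}

int main(void)
{
    struct path paths[2] = { { .weight = 3 }, { .weight = 1 } };
    int total = 0, hits[2] = { 0, 0 };

    for (int i = 0; i < 4000; i++)
        hits[select_path(paths, 2, &total)]++;
    printf("path0=%d path1=%d (3:1 split)\n", hits[0], hits[1]);
    return 0;
}
```

Because each draw spends one unit of credit and the pool is only refilled when empty, every refill cycle hands out picks exactly in proportion to the weights, whatever the random source does.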
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a8e370862bc..58c25ea5a5c1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -12,11 +12,11 @@
12 * 12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet 13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 * 14 *
15 * This work is based on the LPC-trie which is originally descibed in: 15 * This work is based on the LPC-trie which is originally described in:
16 * 16 *
17 * An experimental study of compression methods for dynamic tries 17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 * 20 *
21 * 21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -72,6 +72,7 @@
72#include <linux/init.h> 72#include <linux/init.h>
73#include <linux/list.h> 73#include <linux/list.h>
74#include <linux/slab.h> 74#include <linux/slab.h>
75#include <linux/prefetch.h>
75#include <net/net_namespace.h> 76#include <net/net_namespace.h>
76#include <net/ip.h> 77#include <net/ip.h>
77#include <net/protocol.h> 78#include <net/protocol.h>
@@ -95,7 +96,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 96#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 97#define IS_LEAF(n) (n->parent & T_LEAF)
97 98
98struct node { 99struct rt_trie_node {
99 unsigned long parent; 100 unsigned long parent;
100 t_key key; 101 t_key key;
101}; 102};
@@ -126,7 +127,7 @@ struct tnode {
126 struct work_struct work; 127 struct work_struct work;
127 struct tnode *tnode_free; 128 struct tnode *tnode_free;
128 }; 129 };
129 struct node *child[0]; 130 struct rt_trie_node __rcu *child[0];
130}; 131};
131 132
132#ifdef CONFIG_IP_FIB_TRIE_STATS 133#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +152,16 @@ struct trie_stat {
151}; 152};
152 153
153struct trie { 154struct trie {
154 struct node *trie; 155 struct rt_trie_node __rcu *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 156#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 157 struct trie_use_stats stats;
157#endif 158#endif
158}; 159};
159 160
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 161static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 163 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 165static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 166static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 167/* tnodes to free after resize(); protected by RTNL */
@@ -177,43 +178,58 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 178static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 180
180static inline struct tnode *node_parent(struct node *node) 181/*
182 * caller must hold RTNL
183 */
184static inline struct tnode *node_parent(const struct rt_trie_node *node)
181{ 185{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 186 unsigned long parent;
187
188 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
189
190 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
183} 191}
184 192
185static inline struct tnode *node_parent_rcu(struct node *node) 193/*
194 * caller must hold RCU read lock or RTNL
195 */
196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
186{ 197{
187 struct tnode *ret = node_parent(node); 198 unsigned long parent;
199
200 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
201 lockdep_rtnl_is_held());
188 202
189 return rcu_dereference_check(ret, 203 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
192} 204}
193 205
194/* Same as rcu_assign_pointer 206/* Same as rcu_assign_pointer
195 * but that macro() assumes that value is a pointer. 207 * but that macro() assumes that value is a pointer.
196 */ 208 */
197static inline void node_set_parent(struct node *node, struct tnode *ptr) 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
198{ 210{
199 smp_wmb(); 211 smp_wmb();
200 node->parent = (unsigned long)ptr | NODE_TYPE(node); 212 node->parent = (unsigned long)ptr | NODE_TYPE(node);
201} 213}
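node_parent()/node_set_parent() keep the node's own type (leaf vs. internal tnode) in the low bit of its parent word and mask that bit off when reading the pointer back, relying on struct alignment to keep the low pointer bits free. A minimal tagged-pointer illustration with invented names:

```c
/* Minimal tagged-pointer sketch: the type bit describes the node itself
 * and lives in the low bit of its parent word. Struct alignment keeps
 * that bit zero in a real pointer. */
#include <assert.h>
#include <stdio.h>

#define TOY_T_LEAF    1UL
#define TOY_TYPE_MASK 1UL

struct toy_node {
    unsigned long parent;   /* parent pointer | own type bit */
};

static void set_parent(struct toy_node *n, struct toy_node *parent, int is_leaf)
{
    n->parent = (unsigned long)parent | (is_leaf ? TOY_T_LEAF : 0);
}

static struct toy_node *get_parent(const struct toy_node *n)
{
    return (struct toy_node *)(n->parent & ~TOY_TYPE_MASK);
}

static int node_is_leaf(const struct toy_node *n)
{
    return n->parent & TOY_T_LEAF;
}

int main(void)
{
    struct toy_node root = { 0 }, leaf = { 0 };

    set_parent(&leaf, &root, 1);
    assert(get_parent(&leaf) == &root);
    printf("leaf? %d\n", node_is_leaf(&leaf));
    return 0;
}
```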
202 214
203static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 215/*
216 * caller must hold RTNL
217 */
218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
204{ 219{
205 BUG_ON(i >= 1U << tn->bits); 220 BUG_ON(i >= 1U << tn->bits);
206 221
207 return tn->child[i]; 222 return rtnl_dereference(tn->child[i]);
208} 223}
209 224
210static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 225/*
226 * caller must hold RCU read lock or RTNL
227 */
228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
211{ 229{
212 struct node *ret = tnode_get_child(tn, i); 230 BUG_ON(i >= 1U << tn->bits);
213 231
214 return rcu_dereference_check(ret, 232 return rcu_dereference_rtnl(tn->child[i]);
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
217} 233}
218 234
219static inline int tnode_child_length(const struct tnode *tn) 235static inline int tnode_child_length(const struct tnode *tn)
@@ -221,12 +237,12 @@ static inline int tnode_child_length(const struct tnode *tn)
221 return 1 << tn->bits; 237 return 1 << tn->bits;
222} 238}
223 239
224static inline t_key mask_pfx(t_key k, unsigned short l) 240static inline t_key mask_pfx(t_key k, unsigned int l)
225{ 241{
226 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 242 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
227} 243}
228 244
229static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
230{ 246{
231 if (offset < KEYLENGTH) 247 if (offset < KEYLENGTH)
232 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 248 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
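mask_pfx() and tkey_extract_bits() are plain shift tricks on the 32-bit key: keep the top l bits, or pull out a bits-wide chunk starting offset bits from the most significant end. A userspace copy with a tiny check (host byte order, purely illustrative):

```c
/* Userspace copy of the two key helpers above; KEYLENGTH is 32 for IPv4. */
#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32
typedef uint32_t t_key;

static t_key mask_pfx(t_key k, unsigned int l)
{
    return (l == 0) ? 0 : k >> (KEYLENGTH - l) << (KEYLENGTH - l);
}

static t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
    if (offset < KEYLENGTH)
        return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
    return 0;
}

int main(void)
{
    t_key key = 0xc0000201;   /* 192.0.2.1 as a host-order key */

    /* top 24 bits of the key, i.e. the /24 prefix */
    printf("prefix: %#x\n", mask_pfx(key, 24));
    /* 8 bits starting at the MSB: the first octet, 192 */
    printf("octet0: %u\n", tkey_extract_bits(key, 0, 8));
    return 0;
}
```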
@@ -354,14 +370,9 @@ static inline void free_leaf(struct leaf *l)
354 call_rcu_bh(&l->rcu, __leaf_free_rcu); 370 call_rcu_bh(&l->rcu, __leaf_free_rcu);
355} 371}
356 372
357static void __leaf_info_free_rcu(struct rcu_head *head)
358{
359 kfree(container_of(head, struct leaf_info, rcu));
360}
361
362static inline void free_leaf_info(struct leaf_info *leaf) 373static inline void free_leaf_info(struct leaf_info *leaf)
363{ 374{
364 call_rcu(&leaf->rcu, __leaf_info_free_rcu); 375 kfree_rcu(leaf, rcu);
365} 376}
366 377
367static struct tnode *tnode_alloc(size_t size) 378static struct tnode *tnode_alloc(size_t size)
@@ -369,7 +380,7 @@ static struct tnode *tnode_alloc(size_t size)
369 if (size <= PAGE_SIZE) 380 if (size <= PAGE_SIZE)
370 return kzalloc(size, GFP_KERNEL); 381 return kzalloc(size, GFP_KERNEL);
371 else 382 else
372 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 383 return vzalloc(size);
373} 384}
374 385
375static void __tnode_vfree(struct work_struct *arg) 386static void __tnode_vfree(struct work_struct *arg)
@@ -382,7 +393,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
382{ 393{
383 struct tnode *tn = container_of(head, struct tnode, rcu); 394 struct tnode *tn = container_of(head, struct tnode, rcu);
384 size_t size = sizeof(struct tnode) + 395 size_t size = sizeof(struct tnode) +
385 (sizeof(struct node *) << tn->bits); 396 (sizeof(struct rt_trie_node *) << tn->bits);
386 397
387 if (size <= PAGE_SIZE) 398 if (size <= PAGE_SIZE)
388 kfree(tn); 399 kfree(tn);
@@ -406,7 +417,7 @@ static void tnode_free_safe(struct tnode *tn)
406 tn->tnode_free = tnode_free_head; 417 tn->tnode_free = tnode_free_head;
407 tnode_free_head = tn; 418 tnode_free_head = tn;
408 tnode_free_size += sizeof(struct tnode) + 419 tnode_free_size += sizeof(struct tnode) +
409 (sizeof(struct node *) << tn->bits); 420 (sizeof(struct rt_trie_node *) << tn->bits);
410} 421}
411 422
412static void tnode_free_flush(void) 423static void tnode_free_flush(void)
@@ -447,7 +458,7 @@ static struct leaf_info *leaf_info_new(int plen)
447 458
448static struct tnode *tnode_new(t_key key, int pos, int bits) 459static struct tnode *tnode_new(t_key key, int pos, int bits)
449{ 460{
450 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 461 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
451 struct tnode *tn = tnode_alloc(sz); 462 struct tnode *tn = tnode_alloc(sz);
452 463
453 if (tn) { 464 if (tn) {
@@ -459,8 +470,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
459 tn->empty_children = 1<<bits; 470 tn->empty_children = 1<<bits;
460 } 471 }
461 472
462 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 473 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
463 (unsigned long) (sizeof(struct node) << bits)); 474 sizeof(struct rt_trie_node) << bits);
464 return tn; 475 return tn;
465} 476}
466 477
@@ -469,7 +480,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
469 * and no bits are skipped. See discussion in dyntree paper p. 6 480 * and no bits are skipped. See discussion in dyntree paper p. 6
470 */ 481 */
471 482
472static inline int tnode_full(const struct tnode *tn, const struct node *n) 483static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
473{ 484{
474 if (n == NULL || IS_LEAF(n)) 485 if (n == NULL || IS_LEAF(n))
475 return 0; 486 return 0;
@@ -478,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
478} 489}
479 490
480static inline void put_child(struct trie *t, struct tnode *tn, int i, 491static inline void put_child(struct trie *t, struct tnode *tn, int i,
481 struct node *n) 492 struct rt_trie_node *n)
482{ 493{
483 tnode_put_child_reorg(tn, i, n, -1); 494 tnode_put_child_reorg(tn, i, n, -1);
484} 495}
@@ -488,10 +499,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
488 * Update the value of full_children and empty_children. 499 * Update the value of full_children and empty_children.
489 */ 500 */
490 501
491static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 502static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
492 int wasfull) 503 int wasfull)
493{ 504{
494 struct node *chi = tn->child[i]; 505 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
495 int isfull; 506 int isfull;
496 507
497 BUG_ON(i >= 1<<tn->bits); 508 BUG_ON(i >= 1<<tn->bits);
@@ -519,7 +530,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
519} 530}
520 531
521#define MAX_WORK 10 532#define MAX_WORK 10
522static struct node *resize(struct trie *t, struct tnode *tn) 533static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
523{ 534{
524 int i; 535 int i;
525 struct tnode *old_tn; 536 struct tnode *old_tn;
@@ -609,11 +620,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
609 620
610 /* Keep root node larger */ 621 /* Keep root node larger */
611 622
612 if (!node_parent((struct node*) tn)) { 623 if (!node_parent((struct rt_trie_node *)tn)) {
613 inflate_threshold_use = inflate_threshold_root; 624 inflate_threshold_use = inflate_threshold_root;
614 halve_threshold_use = halve_threshold_root; 625 halve_threshold_use = halve_threshold_root;
615 } 626 } else {
616 else {
617 inflate_threshold_use = inflate_threshold; 627 inflate_threshold_use = inflate_threshold;
618 halve_threshold_use = halve_threshold; 628 halve_threshold_use = halve_threshold;
619 } 629 }
@@ -639,8 +649,8 @@ static struct node *resize(struct trie *t, struct tnode *tn)
639 check_tnode(tn); 649 check_tnode(tn);
640 650
641 /* Return if at least one inflate is run */ 651 /* Return if at least one inflate is run */
642 if( max_work != MAX_WORK) 652 if (max_work != MAX_WORK)
643 return (struct node *) tn; 653 return (struct rt_trie_node *) tn;
644 654
645 /* 655 /*
646 * Halve as long as the number of empty children in this 656 * Halve as long as the number of empty children in this
@@ -668,9 +678,9 @@ static struct node *resize(struct trie *t, struct tnode *tn)
668 if (tn->empty_children == tnode_child_length(tn) - 1) { 678 if (tn->empty_children == tnode_child_length(tn) - 1) {
669one_child: 679one_child:
670 for (i = 0; i < tnode_child_length(tn); i++) { 680 for (i = 0; i < tnode_child_length(tn); i++) {
671 struct node *n; 681 struct rt_trie_node *n;
672 682
673 n = tn->child[i]; 683 n = rtnl_dereference(tn->child[i]);
674 if (!n) 684 if (!n)
675 continue; 685 continue;
676 686
@@ -681,7 +691,21 @@ one_child:
681 return n; 691 return n;
682 } 692 }
683 } 693 }
684 return (struct node *) tn; 694 return (struct rt_trie_node *) tn;
695}
696
697
698static void tnode_clean_free(struct tnode *tn)
699{
700 int i;
701 struct tnode *tofree;
702
703 for (i = 0; i < tnode_child_length(tn); i++) {
704 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
705 if (tofree)
706 tnode_free(tofree);
707 }
708 tnode_free(tn);
685} 709}
686 710
687static struct tnode *inflate(struct trie *t, struct tnode *tn) 711static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -728,14 +752,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
728 goto nomem; 752 goto nomem;
729 } 753 }
730 754
731 put_child(t, tn, 2*i, (struct node *) left); 755 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
732 put_child(t, tn, 2*i+1, (struct node *) right); 756 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
733 } 757 }
734 } 758 }
735 759
736 for (i = 0; i < olen; i++) { 760 for (i = 0; i < olen; i++) {
737 struct tnode *inode; 761 struct tnode *inode;
738 struct node *node = tnode_get_child(oldtnode, i); 762 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
739 struct tnode *left, *right; 763 struct tnode *left, *right;
740 int size, j; 764 int size, j;
741 765
@@ -760,8 +784,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
760 inode = (struct tnode *) node; 784 inode = (struct tnode *) node;
761 785
762 if (inode->bits == 1) { 786 if (inode->bits == 1) {
763 put_child(t, tn, 2*i, inode->child[0]); 787 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
764 put_child(t, tn, 2*i+1, inode->child[1]); 788 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
765 789
766 tnode_free_safe(inode); 790 tnode_free_safe(inode);
767 continue; 791 continue;
@@ -802,8 +826,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
802 826
803 size = tnode_child_length(left); 827 size = tnode_child_length(left);
804 for (j = 0; j < size; j++) { 828 for (j = 0; j < size; j++) {
805 put_child(t, left, j, inode->child[j]); 829 put_child(t, left, j, rtnl_dereference(inode->child[j]));
806 put_child(t, right, j, inode->child[j + size]); 830 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
807 } 831 }
808 put_child(t, tn, 2*i, resize(t, left)); 832 put_child(t, tn, 2*i, resize(t, left));
809 put_child(t, tn, 2*i+1, resize(t, right)); 833 put_child(t, tn, 2*i+1, resize(t, right));
@@ -813,24 +837,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
813 tnode_free_safe(oldtnode); 837 tnode_free_safe(oldtnode);
814 return tn; 838 return tn;
815nomem: 839nomem:
816 { 840 tnode_clean_free(tn);
817 int size = tnode_child_length(tn); 841 return ERR_PTR(-ENOMEM);
818 int j;
819
820 for (j = 0; j < size; j++)
821 if (tn->child[j])
822 tnode_free((struct tnode *)tn->child[j]);
823
824 tnode_free(tn);
825
826 return ERR_PTR(-ENOMEM);
827 }
828} 842}
829 843
830static struct tnode *halve(struct trie *t, struct tnode *tn) 844static struct tnode *halve(struct trie *t, struct tnode *tn)
831{ 845{
832 struct tnode *oldtnode = tn; 846 struct tnode *oldtnode = tn;
833 struct node *left, *right; 847 struct rt_trie_node *left, *right;
834 int i; 848 int i;
835 int olen = tnode_child_length(tn); 849 int olen = tnode_child_length(tn);
836 850
@@ -861,7 +875,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
861 if (!newn) 875 if (!newn)
862 goto nomem; 876 goto nomem;
863 877
864 put_child(t, tn, i/2, (struct node *)newn); 878 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
865 } 879 }
866 880
867 } 881 }
@@ -895,18 +909,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
895 tnode_free_safe(oldtnode); 909 tnode_free_safe(oldtnode);
896 return tn; 910 return tn;
897nomem: 911nomem:
898 { 912 tnode_clean_free(tn);
899 int size = tnode_child_length(tn); 913 return ERR_PTR(-ENOMEM);
900 int j;
901
902 for (j = 0; j < size; j++)
903 if (tn->child[j])
904 tnode_free((struct tnode *)tn->child[j]);
905
906 tnode_free(tn);
907
908 return ERR_PTR(-ENOMEM);
909 }
910} 914}
911 915
912/* readside must use rcu_read_lock currently dump routines 916/* readside must use rcu_read_lock currently dump routines
@@ -963,12 +967,10 @@ fib_find_node(struct trie *t, u32 key)
963{ 967{
964 int pos; 968 int pos;
965 struct tnode *tn; 969 struct tnode *tn;
966 struct node *n; 970 struct rt_trie_node *n;
967 971
968 pos = 0; 972 pos = 0;
969 n = rcu_dereference_check(t->trie, 973 n = rcu_dereference_rtnl(t->trie);
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
972 974
973 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 975 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
974 tn = (struct tnode *) n; 976 tn = (struct tnode *) n;
@@ -1000,17 +1002,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1000 1002
1001 key = tn->key; 1003 key = tn->key;
1002 1004
1003 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 1005 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1004 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1006 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1005 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1007 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1006 tn = (struct tnode *) resize(t, (struct tnode *)tn); 1008 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1007 1009
1008 tnode_put_child_reorg((struct tnode *)tp, cindex, 1010 tnode_put_child_reorg((struct tnode *)tp, cindex,
1009 (struct node *)tn, wasfull); 1011 (struct rt_trie_node *)tn, wasfull);
1010 1012
1011 tp = node_parent((struct node *) tn); 1013 tp = node_parent((struct rt_trie_node *) tn);
1012 if (!tp) 1014 if (!tp)
1013 rcu_assign_pointer(t->trie, (struct node *)tn); 1015 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014 1016
1015 tnode_free_flush(); 1017 tnode_free_flush();
1016 if (!tp) 1018 if (!tp)
@@ -1022,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1022 if (IS_TNODE(tn)) 1024 if (IS_TNODE(tn))
1023 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1025 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1024 1026
1025 rcu_assign_pointer(t->trie, (struct node *)tn); 1027 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1026 tnode_free_flush(); 1028 tnode_free_flush();
1027} 1029}
1028 1030
@@ -1032,7 +1034,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1032{ 1034{
1033 int pos, newpos; 1035 int pos, newpos;
1034 struct tnode *tp = NULL, *tn = NULL; 1036 struct tnode *tp = NULL, *tn = NULL;
1035 struct node *n; 1037 struct rt_trie_node *n;
1036 struct leaf *l; 1038 struct leaf *l;
1037 int missbit; 1039 int missbit;
1038 struct list_head *fa_head = NULL; 1040 struct list_head *fa_head = NULL;
@@ -1040,7 +1042,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1040 t_key cindex; 1042 t_key cindex;
1041 1043
1042 pos = 0; 1044 pos = 0;
1043 n = t->trie; 1045 n = rtnl_dereference(t->trie);
1044 1046
1045 /* If we point to NULL, stop. Either the tree is empty and we should 1047 /* If we point to NULL, stop. Either the tree is empty and we should
1046 * just put a new leaf in if, or we have reached an empty child slot, 1048 * just put a new leaf in if, or we have reached an empty child slot,
@@ -1118,10 +1120,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1118 if (t->trie && n == NULL) { 1120 if (t->trie && n == NULL) {
1119 /* Case 2: n is NULL, and will just insert a new leaf */ 1121 /* Case 2: n is NULL, and will just insert a new leaf */
1120 1122
1121 node_set_parent((struct node *)l, tp); 1123 node_set_parent((struct rt_trie_node *)l, tp);
1122 1124
1123 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1126 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1125 } else { 1127 } else {
1126 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127 /* 1129 /*
@@ -1148,18 +1150,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1148 return NULL; 1150 return NULL;
1149 } 1151 }
1150 1152
1151 node_set_parent((struct node *)tn, tp); 1153 node_set_parent((struct rt_trie_node *)tn, tp);
1152 1154
1153 missbit = tkey_extract_bits(key, newpos, 1); 1155 missbit = tkey_extract_bits(key, newpos, 1);
1154 put_child(t, tn, missbit, (struct node *)l); 1156 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1155 put_child(t, tn, 1-missbit, n); 1157 put_child(t, tn, 1-missbit, n);
1156 1158
1157 if (tp) { 1159 if (tp) {
1158 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1160 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1159 put_child(t, (struct tnode *)tp, cindex, 1161 put_child(t, (struct tnode *)tp, cindex,
1160 (struct node *)tn); 1162 (struct rt_trie_node *)tn);
1161 } else { 1163 } else {
1162 rcu_assign_pointer(t->trie, (struct node *)tn); 1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1163 tp = tn; 1165 tp = tn;
1164 } 1166 }
1165 } 1167 }
@@ -1252,7 +1254,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1252 if (fa->fa_info->fib_priority != fi->fib_priority) 1254 if (fa->fa_info->fib_priority != fi->fib_priority)
1253 break; 1255 break;
1254 if (fa->fa_type == cfg->fc_type && 1256 if (fa->fa_type == cfg->fc_type &&
1255 fa->fa_scope == cfg->fc_scope &&
1256 fa->fa_info == fi) { 1257 fa->fa_info == fi) {
1257 fa_match = fa; 1258 fa_match = fa;
1258 break; 1259 break;
@@ -1278,7 +1279,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1278 new_fa->fa_tos = fa->fa_tos; 1279 new_fa->fa_tos = fa->fa_tos;
1279 new_fa->fa_info = fi; 1280 new_fa->fa_info = fi;
1280 new_fa->fa_type = cfg->fc_type; 1281 new_fa->fa_type = cfg->fc_type;
1281 new_fa->fa_scope = cfg->fc_scope;
1282 state = fa->fa_state; 1282 state = fa->fa_state;
1283 new_fa->fa_state = state & ~FA_S_ACCESSED; 1283 new_fa->fa_state = state & ~FA_S_ACCESSED;
1284 1284
@@ -1315,7 +1315,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1315 new_fa->fa_info = fi; 1315 new_fa->fa_info = fi;
1316 new_fa->fa_tos = tos; 1316 new_fa->fa_tos = tos;
1317 new_fa->fa_type = cfg->fc_type; 1317 new_fa->fa_type = cfg->fc_type;
1318 new_fa->fa_scope = cfg->fc_scope;
1319 new_fa->fa_state = 0; 1318 new_fa->fa_state = 0;
1320 /* 1319 /*
1321 * Insert new entry to the list. 1320 * Insert new entry to the list.
@@ -1329,6 +1328,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1329 } 1328 }
1330 } 1329 }
1331 1330
1331 if (!plen)
1332 tb->tb_num_default++;
1333
1332 list_add_tail_rcu(&new_fa->fa_list, 1334 list_add_tail_rcu(&new_fa->fa_list,
1333 (fa ? &fa->fa_list : fa_head)); 1335 (fa ? &fa->fa_list : fa_head));
1334 1336
@@ -1347,52 +1349,86 @@ err:
1347} 1349}
1348 1350
1349/* should be called with rcu_read_lock */ 1351/* should be called with rcu_read_lock */
1350static int check_leaf(struct trie *t, struct leaf *l, 1352static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1351 t_key key, const struct flowi *flp, 1353 t_key key, const struct flowi4 *flp,
1352 struct fib_result *res) 1354 struct fib_result *res, int fib_flags)
1353{ 1355{
1354 struct leaf_info *li; 1356 struct leaf_info *li;
1355 struct hlist_head *hhead = &l->list; 1357 struct hlist_head *hhead = &l->list;
1356 struct hlist_node *node; 1358 struct hlist_node *node;
1357 1359
1358 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1360 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1359 int err; 1361 struct fib_alias *fa;
1360 int plen = li->plen; 1362 int plen = li->plen;
1361 __be32 mask = inet_make_mask(plen); 1363 __be32 mask = inet_make_mask(plen);
1362 1364
1363 if (l->key != (key & ntohl(mask))) 1365 if (l->key != (key & ntohl(mask)))
1364 continue; 1366 continue;
1365 1367
1366 err = fib_semantic_match(&li->falh, flp, res, plen); 1368 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1369 struct fib_info *fi = fa->fa_info;
1370 int nhsel, err;
1367 1371
1372 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1373 continue;
1374 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1375 continue;
1376 fib_alias_accessed(fa);
1377 err = fib_props[fa->fa_type].error;
1378 if (err) {
1368#ifdef CONFIG_IP_FIB_TRIE_STATS 1379#ifdef CONFIG_IP_FIB_TRIE_STATS
1369 if (err <= 0) 1380 t->stats.semantic_match_passed++;
1370 t->stats.semantic_match_passed++; 1381#endif
1371 else 1382 return err;
1372 t->stats.semantic_match_miss++; 1383 }
1384 if (fi->fib_flags & RTNH_F_DEAD)
1385 continue;
1386 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1387 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1388
1389 if (nh->nh_flags & RTNH_F_DEAD)
1390 continue;
1391 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1392 continue;
1393
1394#ifdef CONFIG_IP_FIB_TRIE_STATS
1395 t->stats.semantic_match_passed++;
1396#endif
1397 res->prefixlen = plen;
1398 res->nh_sel = nhsel;
1399 res->type = fa->fa_type;
1400 res->scope = fa->fa_info->fib_scope;
1401 res->fi = fi;
1402 res->table = tb;
1403 res->fa_head = &li->falh;
1404 if (!(fib_flags & FIB_LOOKUP_NOREF))
1405 atomic_inc(&res->fi->fib_clntref);
1406 return 0;
1407 }
1408 }
1409
1410#ifdef CONFIG_IP_FIB_TRIE_STATS
1411 t->stats.semantic_match_miss++;
1373#endif 1412#endif
1374 if (err <= 0)
1375 return err;
1376 } 1413 }
1377 1414
1378 return 1; 1415 return 1;
1379} 1416}
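check_leaf() now performs the semantic match inline: aliases under the matching prefix are filtered by TOS and scope, route types with a fixed error (blackhole, prohibit, and so on) return it immediately, and otherwise the first alive nexthop compatible with the requested output interface wins. A compressed sketch over simplified stand-in structures (return values are conflated into one int here, unlike the kernel):

```c
/* Compressed sketch of the alias/nexthop matching above. Types and names
 * are simplified stand-ins, not the kernel structures. */
struct toy_nh   { int dead; int oif; };
struct toy_info { int dead; int nnh; const struct toy_nh *nh; };
struct toy_alias {
    int tos;
    int scope;
    int type_error;              /* nonzero for blackhole/prohibit/... */
    const struct toy_info *info;
};

/* returns nexthop index, the alias' error code, or -1 for "no match" */
static int semantic_match(const struct toy_alias *fa, int nfa,
                          int want_tos, int want_scope, int want_oif)
{
    for (int i = 0; i < nfa; i++) {
        if (fa[i].tos && fa[i].tos != want_tos)
            continue;
        if (fa[i].scope < want_scope)
            continue;
        if (fa[i].type_error)
            return fa[i].type_error;
        if (fa[i].info->dead)
            continue;
        for (int n = 0; n < fa[i].info->nnh; n++) {
            const struct toy_nh *nh = &fa[i].info->nh[n];

            if (nh->dead)
                continue;
            if (want_oif && want_oif != nh->oif)
                continue;
            return n;            /* the kernel fills in fib_result here */
        }
    }
    return -1;
}

int main(void)
{
    static const struct toy_nh nhs[] = { { .dead = 1, .oif = 2 },
                                         { .dead = 0, .oif = 3 } };
    static const struct toy_info info = { .dead = 0, .nnh = 2, .nh = nhs };
    static const struct toy_alias alias = { .tos = 0, .scope = 0,
                                            .type_error = 0, .info = &info };

    /* wants oif 3: the first alive, matching nexthop is index 1 */
    return semantic_match(&alias, 1, 0, 0, 3) == 1 ? 0 : 1;
}
```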
1380 1417
1381int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1418int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1382 struct fib_result *res) 1419 struct fib_result *res, int fib_flags)
1383{ 1420{
1384 struct trie *t = (struct trie *) tb->tb_data; 1421 struct trie *t = (struct trie *) tb->tb_data;
1385 int ret; 1422 int ret;
1386 struct node *n; 1423 struct rt_trie_node *n;
1387 struct tnode *pn; 1424 struct tnode *pn;
1388 int pos, bits; 1425 unsigned int pos, bits;
1389 t_key key = ntohl(flp->fl4_dst); 1426 t_key key = ntohl(flp->daddr);
1390 int chopped_off; 1427 unsigned int chopped_off;
1391 t_key cindex = 0; 1428 t_key cindex = 0;
1392 int current_prefix_length = KEYLENGTH; 1429 unsigned int current_prefix_length = KEYLENGTH;
1393 struct tnode *cn; 1430 struct tnode *cn;
1394 t_key node_prefix, key_prefix, pref_mismatch; 1431 t_key pref_mismatch;
1395 int mp;
1396 1432
1397 rcu_read_lock(); 1433 rcu_read_lock();
1398 1434
@@ -1406,7 +1442,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1406 1442
1407 /* Just a leaf? */ 1443 /* Just a leaf? */
1408 if (IS_LEAF(n)) { 1444 if (IS_LEAF(n)) {
1409 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1445 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1410 goto found; 1446 goto found;
1411 } 1447 }
1412 1448
@@ -1431,7 +1467,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1431 } 1467 }
1432 1468
1433 if (IS_LEAF(n)) { 1469 if (IS_LEAF(n)) {
1434 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1470 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1435 if (ret > 0) 1471 if (ret > 0)
1436 goto backtrace; 1472 goto backtrace;
1437 goto found; 1473 goto found;
@@ -1507,10 +1543,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1507 * matching prefix. 1543 * matching prefix.
1508 */ 1544 */
1509 1545
1510 node_prefix = mask_pfx(cn->key, cn->pos); 1546 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1511 key_prefix = mask_pfx(key, cn->pos);
1512 pref_mismatch = key_prefix^node_prefix;
1513 mp = 0;
1514 1547
1515 /* 1548 /*
1516 * In short: If skipped bits in this node do not match 1549 * In short: If skipped bits in this node do not match
@@ -1518,13 +1551,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1518 * state.directly. 1551 * state.directly.
1519 */ 1552 */
1520 if (pref_mismatch) { 1553 if (pref_mismatch) {
1521 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1554 int mp = KEYLENGTH - fls(pref_mismatch);
1522 mp++;
1523 pref_mismatch = pref_mismatch << 1;
1524 }
1525 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526 1555
1527 if (key_prefix != 0) 1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1528 goto backtrace; 1557 goto backtrace;
1529 1558
1530 if (current_prefix_length >= cn->pos) 1559 if (current_prefix_length >= cn->pos)
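The loop that walked pref_mismatch one bit at a time is replaced by KEYLENGTH - fls(pref_mismatch): fls() returns the 1-based index of the highest set bit, so the difference is exactly the number of leading zero bits, i.e. the position of the first mismatching bit counted from the MSB. A quick equivalence check (fls() emulated with __builtin_clz(), GCC/Clang only; the mask is never zero on this path, and the old loop would not terminate for zero either):

```c
/* Check that KEYLENGTH - fls(x) matches the shift loop it replaces. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32

static int fls32(uint32_t x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

static int old_way(uint32_t pref_mismatch)
{
    int mp = 0;

    while (!(pref_mismatch & (1u << (KEYLENGTH - 1)))) {
        mp++;
        pref_mismatch <<= 1;
    }
    return mp;
}

int main(void)
{
    uint32_t samples[] = { 0x80000000u, 0x00010000u, 0x00000001u, 0x0f0f0f0fu };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int new_mp = KEYLENGTH - fls32(samples[i]);

        assert(new_mp == old_way(samples[i]));
        printf("%#010x -> mp=%d\n", samples[i], new_mp);
    }
    return 0;
}
```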
@@ -1556,7 +1585,7 @@ backtrace:
1556 if (chopped_off <= pn->bits) { 1585 if (chopped_off <= pn->bits) {
1557 cindex &= ~(1 << (chopped_off-1)); 1586 cindex &= ~(1 << (chopped_off-1));
1558 } else { 1587 } else {
1559 struct tnode *parent = node_parent_rcu((struct node *) pn); 1588 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1560 if (!parent) 1589 if (!parent)
1561 goto failed; 1590 goto failed;
1562 1591
@@ -1583,7 +1612,7 @@ found:
1583 */ 1612 */
1584static void trie_leaf_remove(struct trie *t, struct leaf *l) 1613static void trie_leaf_remove(struct trie *t, struct leaf *l)
1585{ 1614{
1586 struct tnode *tp = node_parent((struct node *) l); 1615 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1587 1616
1588 pr_debug("entering trie_leaf_remove(%p)\n", l); 1617 pr_debug("entering trie_leaf_remove(%p)\n", l);
1589 1618
@@ -1644,7 +1673,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1644 1673
1645 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1674 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1646 (cfg->fc_scope == RT_SCOPE_NOWHERE || 1675 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1647 fa->fa_scope == cfg->fc_scope) && 1676 fa->fa_info->fib_scope == cfg->fc_scope) &&
1677 (!cfg->fc_prefsrc ||
1678 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1648 (!cfg->fc_protocol || 1679 (!cfg->fc_protocol ||
1649 fi->fib_protocol == cfg->fc_protocol) && 1680 fi->fib_protocol == cfg->fc_protocol) &&
1650 fib_nh_match(cfg, fi) == 0) { 1681 fib_nh_match(cfg, fi) == 0) {
@@ -1665,6 +1696,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1665 1696
1666 list_del_rcu(&fa->fa_list); 1697 list_del_rcu(&fa->fa_list);
1667 1698
1699 if (!plen)
1700 tb->tb_num_default--;
1701
1668 if (list_empty(fa_head)) { 1702 if (list_empty(fa_head)) {
1669 hlist_del_rcu(&li->hlist); 1703 hlist_del_rcu(&li->hlist);
1670 free_leaf_info(li); 1704 free_leaf_info(li);
@@ -1721,7 +1755,7 @@ static int trie_flush_leaf(struct leaf *l)
1721 * Scan for the next right leaf starting at node p->child[idx] 1755 * Scan for the next right leaf starting at node p->child[idx]
1722 * Since we have back pointer, no recursion necessary. 1756 * Since we have back pointer, no recursion necessary.
1723 */ 1757 */
1724static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1725{ 1759{
1726 do { 1760 do {
1727 t_key idx; 1761 t_key idx;
@@ -1737,7 +1771,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1737 continue; 1771 continue;
1738 1772
1739 if (IS_LEAF(c)) { 1773 if (IS_LEAF(c)) {
1740 prefetch(p->child[idx]); 1774 prefetch(rcu_dereference_rtnl(p->child[idx]));
1741 return (struct leaf *) c; 1775 return (struct leaf *) c;
1742 } 1776 }
1743 1777
@@ -1747,17 +1781,15 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1747 } 1781 }
1748 1782
1749 /* Node empty, walk back up to parent */ 1783 /* Node empty, walk back up to parent */
1750 c = (struct node *) p; 1784 c = (struct rt_trie_node *) p;
1751 } while ( (p = node_parent_rcu(c)) != NULL); 1785 } while ((p = node_parent_rcu(c)) != NULL);
1752 1786
1753 return NULL; /* Root of trie */ 1787 return NULL; /* Root of trie */
1754} 1788}
1755 1789
1756static struct leaf *trie_firstleaf(struct trie *t) 1790static struct leaf *trie_firstleaf(struct trie *t)
1757{ 1791{
1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, 1792 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1761 1793
1762 if (!n) 1794 if (!n)
1763 return NULL; 1795 return NULL;
@@ -1770,7 +1802,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1770 1802
1771static struct leaf *trie_nextleaf(struct leaf *l) 1803static struct leaf *trie_nextleaf(struct leaf *l)
1772{ 1804{
1773 struct node *c = (struct node *) l; 1805 struct rt_trie_node *c = (struct rt_trie_node *) l;
1774 struct tnode *p = node_parent_rcu(c); 1806 struct tnode *p = node_parent_rcu(c);
1775 1807
1776 if (!p) 1808 if (!p)
@@ -1814,77 +1846,9 @@ int fib_table_flush(struct fib_table *tb)
1814 return found; 1846 return found;
1815} 1847}
1816 1848
1817void fib_table_select_default(struct fib_table *tb, 1849void fib_free_table(struct fib_table *tb)
1818 const struct flowi *flp,
1819 struct fib_result *res)
1820{ 1850{
1821 struct trie *t = (struct trie *) tb->tb_data; 1851 kfree(tb);
1822 int order, last_idx;
1823 struct fib_info *fi = NULL;
1824 struct fib_info *last_resort;
1825 struct fib_alias *fa = NULL;
1826 struct list_head *fa_head;
1827 struct leaf *l;
1828
1829 last_idx = -1;
1830 last_resort = NULL;
1831 order = -1;
1832
1833 rcu_read_lock();
1834
1835 l = fib_find_node(t, 0);
1836 if (!l)
1837 goto out;
1838
1839 fa_head = get_fa_head(l, 0);
1840 if (!fa_head)
1841 goto out;
1842
1843 if (list_empty(fa_head))
1844 goto out;
1845
1846 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1847 struct fib_info *next_fi = fa->fa_info;
1848
1849 if (fa->fa_scope != res->scope ||
1850 fa->fa_type != RTN_UNICAST)
1851 continue;
1852
1853 if (next_fi->fib_priority > res->fi->fib_priority)
1854 break;
1855 if (!next_fi->fib_nh[0].nh_gw ||
1856 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1857 continue;
1858 fa->fa_state |= FA_S_ACCESSED;
1859
1860 if (fi == NULL) {
1861 if (next_fi != res->fi)
1862 break;
1863 } else if (!fib_detect_death(fi, order, &last_resort,
1864 &last_idx, tb->tb_default)) {
1865 fib_result_assign(res, fi);
1866 tb->tb_default = order;
1867 goto out;
1868 }
1869 fi = next_fi;
1870 order++;
1871 }
1872 if (order <= 0 || fi == NULL) {
1873 tb->tb_default = -1;
1874 goto out;
1875 }
1876
1877 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1878 tb->tb_default)) {
1879 fib_result_assign(res, fi);
1880 tb->tb_default = order;
1881 goto out;
1882 }
1883 if (last_idx >= 0)
1884 fib_result_assign(res, last_resort);
1885 tb->tb_default = last_idx;
1886out:
1887 rcu_read_unlock();
1888} 1852}
1889 1853
1890static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
@@ -1911,7 +1875,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1911 RTM_NEWROUTE, 1875 RTM_NEWROUTE,
1912 tb->tb_id, 1876 tb->tb_id,
1913 fa->fa_type, 1877 fa->fa_type,
1914 fa->fa_scope,
1915 xkey, 1878 xkey,
1916 plen, 1879 plen,
1917 fa->fa_tos, 1880 fa->fa_tos,
@@ -2001,7 +1964,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2001 return skb->len; 1964 return skb->len;
2002} 1965}
2003 1966
2004void __init fib_hash_init(void) 1967void __init fib_trie_init(void)
2005{ 1968{
2006 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1969 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2007 sizeof(struct fib_alias), 1970 sizeof(struct fib_alias),
@@ -2014,8 +1977,7 @@ void __init fib_hash_init(void)
2014} 1977}
2015 1978
2016 1979
2017/* Fix more generic FIB names for init later */ 1980struct fib_table *fib_trie_table(u32 id)
2018struct fib_table *fib_hash_table(u32 id)
2019{ 1981{
2020 struct fib_table *tb; 1982 struct fib_table *tb;
2021 struct trie *t; 1983 struct trie *t;
@@ -2027,13 +1989,11 @@ struct fib_table *fib_hash_table(u32 id)
2027 1989
2028 tb->tb_id = id; 1990 tb->tb_id = id;
2029 tb->tb_default = -1; 1991 tb->tb_default = -1;
1992 tb->tb_num_default = 0;
2030 1993
2031 t = (struct trie *) tb->tb_data; 1994 t = (struct trie *) tb->tb_data;
2032 memset(t, 0, sizeof(*t)); 1995 memset(t, 0, sizeof(*t));
2033 1996
2034 if (id == RT_TABLE_LOCAL)
2035 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2036
2037 return tb; 1997 return tb;
2038} 1998}
2039 1999
@@ -2043,14 +2003,14 @@ struct fib_trie_iter {
2043 struct seq_net_private p; 2003 struct seq_net_private p;
2044 struct fib_table *tb; 2004 struct fib_table *tb;
2045 struct tnode *tnode; 2005 struct tnode *tnode;
2046 unsigned index; 2006 unsigned int index;
2047 unsigned depth; 2007 unsigned int depth;
2048}; 2008};
2049 2009
2050static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2010static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2051{ 2011{
2052 struct tnode *tn = iter->tnode; 2012 struct tnode *tn = iter->tnode;
2053 unsigned cindex = iter->index; 2013 unsigned int cindex = iter->index;
2054 struct tnode *p; 2014 struct tnode *p;
2055 2015
2056 /* A single entry routing table */ 2016 /* A single entry routing table */
@@ -2061,7 +2021,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2061 iter->tnode, iter->index, iter->depth); 2021 iter->tnode, iter->index, iter->depth);
2062rescan: 2022rescan:
2063 while (cindex < (1<<tn->bits)) { 2023 while (cindex < (1<<tn->bits)) {
2064 struct node *n = tnode_get_child_rcu(tn, cindex); 2024 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2065 2025
2066 if (n) { 2026 if (n) {
2067 if (IS_LEAF(n)) { 2027 if (IS_LEAF(n)) {
@@ -2080,7 +2040,7 @@ rescan:
2080 } 2040 }
2081 2041
2082 /* Current node exhausted, pop back up */ 2042 /* Current node exhausted, pop back up */
2083 p = node_parent_rcu((struct node *)tn); 2043 p = node_parent_rcu((struct rt_trie_node *)tn);
2084 if (p) { 2044 if (p) {
2085 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2045 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2086 tn = p; 2046 tn = p;
@@ -2092,10 +2052,10 @@ rescan:
2092 return NULL; 2052 return NULL;
2093} 2053}
2094 2054
2095static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2055static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2096 struct trie *t) 2056 struct trie *t)
2097{ 2057{
2098 struct node *n; 2058 struct rt_trie_node *n;
2099 2059
2100 if (!t) 2060 if (!t)
2101 return NULL; 2061 return NULL;
@@ -2119,7 +2079,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2119 2079
2120static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2121{ 2081{
2122 struct node *n; 2082 struct rt_trie_node *n;
2123 struct fib_trie_iter iter; 2083 struct fib_trie_iter iter;
2124 2084
2125 memset(s, 0, sizeof(*s)); 2085 memset(s, 0, sizeof(*s));
@@ -2159,7 +2119,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2159 */ 2119 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2120static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161{ 2121{
2162 unsigned i, max, pointers, bytes, avdepth; 2122 unsigned int i, max, pointers, bytes, avdepth;
2163 2123
2164 if (stat->leaves) 2124 if (stat->leaves)
2165 avdepth = stat->totdepth*100 / stat->leaves; 2125 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2192,7 +2152,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2192 seq_putc(seq, '\n'); 2152 seq_putc(seq, '\n');
2193 seq_printf(seq, "\tPointers: %u\n", pointers); 2153 seq_printf(seq, "\tPointers: %u\n", pointers);
2194 2154
2195 bytes += sizeof(struct node *) * pointers; 2155 bytes += sizeof(struct rt_trie_node *) * pointers;
2196 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2156 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2197 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2157 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2198} 2158}
@@ -2273,7 +2233,7 @@ static const struct file_operations fib_triestat_fops = {
2273 .release = single_release_net, 2233 .release = single_release_net,
2274}; 2234};
2275 2235
2276static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2236static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2277{ 2237{
2278 struct fib_trie_iter *iter = seq->private; 2238 struct fib_trie_iter *iter = seq->private;
2279 struct net *net = seq_file_net(seq); 2239 struct net *net = seq_file_net(seq);
@@ -2286,7 +2246,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2286 struct fib_table *tb; 2246 struct fib_table *tb;
2287 2247
2288 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2248 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2289 struct node *n; 2249 struct rt_trie_node *n;
2290 2250
2291 for (n = fib_trie_get_first(iter, 2251 for (n = fib_trie_get_first(iter,
2292 (struct trie *) tb->tb_data); 2252 (struct trie *) tb->tb_data);
@@ -2315,7 +2275,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2315 struct fib_table *tb = iter->tb; 2275 struct fib_table *tb = iter->tb;
2316 struct hlist_node *tb_node; 2276 struct hlist_node *tb_node;
2317 unsigned int h; 2277 unsigned int h;
2318 struct node *n; 2278 struct rt_trie_node *n;
2319 2279
2320 ++*pos; 2280 ++*pos;
2321 /* next node in same table */ 2281 /* next node in same table */
@@ -2325,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2325 2285
2326 /* walk rest of this hash chain */ 2286 /* walk rest of this hash chain */
2327 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); 2287 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2328 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { 2288 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2329 tb = hlist_entry(tb_node, struct fib_table, tb_hlist); 2289 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2330 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); 2290 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2331 if (n) 2291 if (n)
@@ -2356,7 +2316,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2356 2316
2357static void seq_indent(struct seq_file *seq, int n) 2317static void seq_indent(struct seq_file *seq, int n)
2358{ 2318{
2359 while (n-- > 0) seq_puts(seq, " "); 2319 while (n-- > 0)
2320 seq_puts(seq, " ");
2360} 2321}
2361 2322
2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2323static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2349,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
2388 [RTN_XRESOLVE] = "XRESOLVE", 2349 [RTN_XRESOLVE] = "XRESOLVE",
2389}; 2350};
2390 2351
2391static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2352static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2392{ 2353{
2393 if (t < __RTN_MAX && rtn_type_names[t]) 2354 if (t < __RTN_MAX && rtn_type_names[t])
2394 return rtn_type_names[t]; 2355 return rtn_type_names[t];
@@ -2400,7 +2361,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2400static int fib_trie_seq_show(struct seq_file *seq, void *v) 2361static int fib_trie_seq_show(struct seq_file *seq, void *v)
2401{ 2362{
2402 const struct fib_trie_iter *iter = seq->private; 2363 const struct fib_trie_iter *iter = seq->private;
2403 struct node *n = v; 2364 struct rt_trie_node *n = v;
2404 2365
2405 if (!node_parent_rcu(n)) 2366 if (!node_parent_rcu(n))
2406 fib_table_print(seq, iter->tb); 2367 fib_table_print(seq, iter->tb);
@@ -2432,7 +2393,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2432 seq_indent(seq, iter->depth+1); 2393 seq_indent(seq, iter->depth+1);
2433 seq_printf(seq, " /%d %s %s", li->plen, 2394 seq_printf(seq, " /%d %s %s", li->plen,
2434 rtn_scope(buf1, sizeof(buf1), 2395 rtn_scope(buf1, sizeof(buf1),
2435 fa->fa_scope), 2396 fa->fa_info->fib_scope),
2436 rtn_type(buf2, sizeof(buf2), 2397 rtn_type(buf2, sizeof(buf2),
2437 fa->fa_type)); 2398 fa->fa_type));
2438 if (fa->fa_tos) 2399 if (fa->fa_tos)
@@ -2544,13 +2505,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2544 rcu_read_unlock(); 2505 rcu_read_unlock();
2545} 2506}
2546 2507
2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2508static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548{ 2509{
2549 static unsigned type2flags[RTN_MAX + 1] = { 2510 unsigned int flags = 0;
2550 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551 };
2552 unsigned flags = type2flags[type];
2553 2511
2512 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2513 flags = RTF_REJECT;
2554 if (fi && fi->fib_nh->nh_gw) 2514 if (fi && fi->fib_nh->nh_gw)
2555 flags |= RTF_GATEWAY; 2515 flags |= RTF_GATEWAY;
2556 if (mask == htonl(0xFFFFFFFF)) 2516 if (mask == htonl(0xFFFFFFFF))
@@ -2562,7 +2522,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2562/* 2522/*
2563 * This outputs /proc/net/route. 2523 * This outputs /proc/net/route.
2564 * The format of the file is not supposed to be changed 2524 * The format of the file is not supposed to be changed
2565 * and needs to be same as fib_hash output to avoid breaking 2525 * and needs to be same as fib_hash output to avoid breaking
2566 * legacy utilities 2526 * legacy utilities
2567 */ 2527 */
2568static int fib_route_seq_show(struct seq_file *seq, void *v) 2528static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2587,7 +2547,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2587 2547
2588 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2548 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589 const struct fib_info *fi = fa->fa_info; 2549 const struct fib_info *fi = fa->fa_info;
2590 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2550 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2591 int len; 2551 int len;
2592 2552
2593 if (fa->fa_type == RTN_BROADCAST 2553 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..c6933f2ea310
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
25static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (rcu_dereference_protected(gre_proto[version],
55 lockdep_is_held(&gre_proto_lock)) != proto)
56 goto err_out_unlock;
57 rcu_assign_pointer(gre_proto[version], NULL);
58 spin_unlock(&gre_proto_lock);
59 synchronize_rcu();
60 return 0;
61
62err_out_unlock:
63 spin_unlock(&gre_proto_lock);
64err_out:
65 return -1;
66}
67EXPORT_SYMBOL_GPL(gre_del_protocol);
68
69static int gre_rcv(struct sk_buff *skb)
70{
71 const struct gre_protocol *proto;
72 u8 ver;
73 int ret;
74
75 if (!pskb_may_pull(skb, 12))
76 goto drop;
77
78 ver = skb->data[1]&0x7f;
79 if (ver >= GREPROTO_MAX)
80 goto drop;
81
82 rcu_read_lock();
83 proto = rcu_dereference(gre_proto[ver]);
84 if (!proto || !proto->handler)
85 goto drop_unlock;
86 ret = proto->handler(skb);
87 rcu_read_unlock();
88 return ret;
89
90drop_unlock:
91 rcu_read_unlock();
92drop:
93 kfree_skb(skb);
94 return NET_RX_DROP;
95}
96
97static void gre_err(struct sk_buff *skb, u32 info)
98{
99 const struct gre_protocol *proto;
100 u8 ver;
101
102 if (!pskb_may_pull(skb, 12))
103 goto drop;
104
105 ver = skb->data[1]&0x7f;
106 if (ver >= GREPROTO_MAX)
107 goto drop;
108
109 rcu_read_lock();
110 proto = rcu_dereference(gre_proto[ver]);
111 if (!proto || !proto->err_handler)
112 goto drop_unlock;
113 proto->err_handler(skb, info);
114 rcu_read_unlock();
115 return;
116
117drop_unlock:
118 rcu_read_unlock();
119drop:
120 kfree_skb(skb);
121}
122
123static const struct net_protocol net_gre_protocol = {
124 .handler = gre_rcv,
125 .err_handler = gre_err,
126 .netns_ok = 1,
127};
128
129static int __init gre_init(void)
130{
131 pr_info("GRE over IPv4 demultiplexor driver");
132
133 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
134 pr_err("gre: can't add protocol\n");
135 return -EAGAIN;
136 }
137
138 return 0;
139}
140
141static void __exit gre_exit(void)
142{
143 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
144}
145
146module_init(gre_init);
147module_exit(gre_exit);
148
149MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
150MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
151MODULE_LICENSE("GPL");
152
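Editor's note on the new gre.c above: the file is a tiny array-based demultiplexer, keyed by the GRE version bits and protected by RCU for readers and a spinlock for writers. A hedged registration sketch, not part of this commit, shows how a GRE sub-protocol module (PPTP, for example) would plug into it; the GREPROTO_PPTP constant and the struct gre_protocol layout are assumed from include/net/gre.h, and mapping the -1 error return to -EBUSY is purely illustrative:

	#include <linux/module.h>
	#include <linux/errno.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>
	#include <net/gre.h>

	static int my_gre_handler(struct sk_buff *skb)
	{
		/* parse the version-1 GRE header here; the handler owns the skb */
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

	static const struct gre_protocol my_gre_proto = {
		.handler	= my_gre_handler,
		/* .err_handler is optional; gre_err() checks for NULL */
	};

	static int __init my_gre_init(void)
	{
		/* gre_add_protocol() returns -1 if the slot is taken or invalid */
		return gre_add_protocol(&my_gre_proto, GREPROTO_PPTP) ? -EBUSY : 0;
	}

	static void __exit my_gre_exit(void)
	{
		gre_del_protocol(&my_gre_proto, GREPROTO_PPTP);
	}

	module_init(my_gre_init);
	module_exit(my_gre_exit);
	MODULE_LICENSE("GPL");

With such a module loaded, gre_rcv() dispatches any GRE packet whose version field selects that slot to the handler under rcu_read_lock(), and the synchronize_rcu() inside gre_del_protocol() makes unloading safe against in-flight receivers.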
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..5395e45dcce6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -83,6 +83,7 @@
83#include <net/tcp.h> 83#include <net/tcp.h>
84#include <net/udp.h> 84#include <net/udp.h>
85#include <net/raw.h> 85#include <net/raw.h>
86#include <net/ping.h>
86#include <linux/skbuff.h> 87#include <linux/skbuff.h>
87#include <net/sock.h> 88#include <net/sock.h>
88#include <linux/errno.h> 89#include <linux/errno.h>
@@ -108,8 +109,7 @@ struct icmp_bxm {
108 __be32 times[3]; 109 __be32 times[3];
109 } data; 110 } data;
110 int head_len; 111 int head_len;
111 struct ip_options replyopts; 112 struct ip_options_data replyopts;
112 unsigned char optbuf[40];
113}; 113};
114 114
115/* An array of errno for error messages from dest unreach. */ 115/* An array of errno for error messages from dest unreach. */
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message. 237 struct flowi4 *fl4, int type, int code)
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, fl4->daddr, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
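Editor's note on the hunk above: the long comment and xrlim_allow() are removed, but the behaviour they describe, a token bucket per destination, is now supplied by inet_peer_xrlim_allow() on the bound inet_peer. A small userspace sketch of the same arithmetic (not in the patch; the timeout of 1000 jiffies stands in for sysctl_icmp_ratelimit, and 6 is the old XRLIM_BURST_FACTOR) makes the burst-then-steady pattern concrete:

	#include <stdio.h>

	#define XRLIM_BURST_FACTOR 6

	static unsigned long rate_tokens, rate_last;

	static int xrlim_allow_demo(unsigned long now, unsigned long timeout)
	{
		unsigned long token = rate_tokens + (now - rate_last);
		int rc = 0;

		rate_last = now;
		if (token > XRLIM_BURST_FACTOR * timeout)
			token = XRLIM_BURST_FACTOR * timeout;
		if (token >= timeout) {
			token -= timeout;
			rc = 1;
		}
		rate_tokens = token;
		return rc;
	}

	int main(void)
	{
		unsigned long now;

		rate_tokens = XRLIM_BURST_FACTOR * 1000;	/* start with a full bucket */
		for (now = 0; now < 8; now++)			/* 8 requests, one jiffy apart */
			printf("%d", xrlim_allow_demo(now, 1000));
		printf("\n");					/* prints 11111100 */
		return 0;
	}

Starting from a full bucket, six requests in quick succession are allowed and later ones are refused until a full timeout's worth of tokens has accumulated again, which is the SHOULD-level rate limit RFC 1812 asks for.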
@@ -324,13 +291,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
324} 291}
325 292
326static void icmp_push_reply(struct icmp_bxm *icmp_param, 293static void icmp_push_reply(struct icmp_bxm *icmp_param,
294 struct flowi4 *fl4,
327 struct ipcm_cookie *ipc, struct rtable **rt) 295 struct ipcm_cookie *ipc, struct rtable **rt)
328{ 296{
329 struct sock *sk; 297 struct sock *sk;
330 struct sk_buff *skb; 298 struct sk_buff *skb;
331 299
332 sk = icmp_sk(dev_net((*rt)->dst.dev)); 300 sk = icmp_sk(dev_net((*rt)->dst.dev));
333 if (ip_append_data(sk, icmp_glue_bits, icmp_param, 301 if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
334 icmp_param->data_len+icmp_param->head_len, 302 icmp_param->data_len+icmp_param->head_len,
335 icmp_param->head_len, 303 icmp_param->head_len,
336 ipc, rt, MSG_DONTWAIT) < 0) { 304 ipc, rt, MSG_DONTWAIT) < 0) {
@@ -349,7 +317,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
349 icmp_param->head_len, csum); 317 icmp_param->head_len, csum);
350 icmph->checksum = csum_fold(csum); 318 icmph->checksum = csum_fold(csum);
351 skb->ip_summed = CHECKSUM_NONE; 319 skb->ip_summed = CHECKSUM_NONE;
352 ip_push_pending_frames(sk); 320 ip_push_pending_frames(sk, fl4);
353 } 321 }
354} 322}
355 323
@@ -362,11 +330,12 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
362 struct ipcm_cookie ipc; 330 struct ipcm_cookie ipc;
363 struct rtable *rt = skb_rtable(skb); 331 struct rtable *rt = skb_rtable(skb);
364 struct net *net = dev_net(rt->dst.dev); 332 struct net *net = dev_net(rt->dst.dev);
333 struct flowi4 fl4;
365 struct sock *sk; 334 struct sock *sk;
366 struct inet_sock *inet; 335 struct inet_sock *inet;
367 __be32 daddr; 336 __be32 daddr;
368 337
369 if (ip_options_echo(&icmp_param->replyopts, skb)) 338 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
370 return; 339 return;
371 340
372 sk = icmp_xmit_lock(net); 341 sk = icmp_xmit_lock(net);
@@ -377,32 +346,120 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
377 icmp_param->data.icmph.checksum = 0; 346 icmp_param->data.icmph.checksum = 0;
378 347
379 inet->tos = ip_hdr(skb)->tos; 348 inet->tos = ip_hdr(skb)->tos;
380 daddr = ipc.addr = rt->rt_src; 349 daddr = ipc.addr = ip_hdr(skb)->saddr;
381 ipc.opt = NULL; 350 ipc.opt = NULL;
382 ipc.shtx.flags = 0; 351 ipc.tx_flags = 0;
383 if (icmp_param->replyopts.optlen) { 352 if (icmp_param->replyopts.opt.opt.optlen) {
384 ipc.opt = &icmp_param->replyopts; 353 ipc.opt = &icmp_param->replyopts.opt;
385 if (ipc.opt->srr) 354 if (ipc.opt->opt.srr)
386 daddr = icmp_param->replyopts.faddr; 355 daddr = icmp_param->replyopts.opt.opt.faddr;
387 } 356 }
388 { 357 memset(&fl4, 0, sizeof(fl4));
389 struct flowi fl = { .nl_u = { .ip4_u = 358 fl4.daddr = daddr;
390 { .daddr = daddr, 359 fl4.saddr = rt->rt_spec_dst;
391 .saddr = rt->rt_spec_dst, 360 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
392 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 361 fl4.flowi4_proto = IPPROTO_ICMP;
393 .proto = IPPROTO_ICMP }; 362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
394 security_skb_classify_flow(skb, &fl); 363 rt = ip_route_output_key(net, &fl4);
395 if (ip_route_output_key(net, &rt, &fl)) 364 if (IS_ERR(rt))
396 goto out_unlock; 365 goto out_unlock;
397 } 366 if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
398 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
399 icmp_param->data.icmph.code)) 367 icmp_param->data.icmph.code))
400 icmp_push_reply(icmp_param, &ipc, &rt); 368 icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
401 ip_rt_put(rt); 369 ip_rt_put(rt);
402out_unlock: 370out_unlock:
403 icmp_xmit_unlock(sk); 371 icmp_xmit_unlock(sk);
404} 372}
405 373
374static struct rtable *icmp_route_lookup(struct net *net,
375 struct flowi4 *fl4,
376 struct sk_buff *skb_in,
377 const struct iphdr *iph,
378 __be32 saddr, u8 tos,
379 int type, int code,
380 struct icmp_bxm *param)
381{
382 struct rtable *rt, *rt2;
383 int err;
384
385 memset(fl4, 0, sizeof(*fl4));
386 fl4->daddr = (param->replyopts.opt.opt.srr ?
387 param->replyopts.opt.opt.faddr : iph->saddr);
388 fl4->saddr = saddr;
389 fl4->flowi4_tos = RT_TOS(tos);
390 fl4->flowi4_proto = IPPROTO_ICMP;
391 fl4->fl4_icmp_type = type;
392 fl4->fl4_icmp_code = code;
393 security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
394 rt = __ip_route_output_key(net, fl4);
395 if (IS_ERR(rt))
396 return rt;
397
398 /* No need to clone since we're just using its address. */
399 rt2 = rt;
400
401 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
402 flowi4_to_flowi(fl4), NULL, 0);
403 if (!IS_ERR(rt)) {
404 if (rt != rt2)
405 return rt;
406 } else if (PTR_ERR(rt) == -EPERM) {
407 rt = NULL;
408 } else
409 return rt;
410
411 err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET);
412 if (err)
413 goto relookup_failed;
414
415 if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) {
416 rt2 = __ip_route_output_key(net, fl4);
417 if (IS_ERR(rt2))
418 err = PTR_ERR(rt2);
419 } else {
420 struct flowi4 fl4_2 = {};
421 unsigned long orefdst;
422
423 fl4_2.daddr = fl4->saddr;
424 rt2 = ip_route_output_key(net, &fl4_2);
425 if (IS_ERR(rt2)) {
426 err = PTR_ERR(rt2);
427 goto relookup_failed;
428 }
429 /* Ugh! */
430 orefdst = skb_in->_skb_refdst; /* save old refdst */
431 err = ip_route_input(skb_in, fl4->daddr, fl4->saddr,
432 RT_TOS(tos), rt2->dst.dev);
433
434 dst_release(&rt2->dst);
435 rt2 = skb_rtable(skb_in);
436 skb_in->_skb_refdst = orefdst; /* restore old refdst */
437 }
438
439 if (err)
440 goto relookup_failed;
441
442 rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
443 flowi4_to_flowi(fl4), NULL,
444 XFRM_LOOKUP_ICMP);
445 if (!IS_ERR(rt2)) {
446 dst_release(&rt->dst);
447 rt = rt2;
448 } else if (PTR_ERR(rt2) == -EPERM) {
449 if (rt)
450 dst_release(&rt->dst);
451 return rt2;
452 } else {
453 err = PTR_ERR(rt2);
454 goto relookup_failed;
455 }
456 return rt;
457
458relookup_failed:
459 if (rt)
460 return rt;
461 return ERR_PTR(err);
462}
406 463
407/* 464/*
408 * Send an ICMP message in response to a situation 465 * Send an ICMP message in response to a situation
@@ -422,6 +479,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
422 struct icmp_bxm icmp_param; 479 struct icmp_bxm icmp_param;
423 struct rtable *rt = skb_rtable(skb_in); 480 struct rtable *rt = skb_rtable(skb_in);
424 struct ipcm_cookie ipc; 481 struct ipcm_cookie ipc;
482 struct flowi4 fl4;
425 __be32 saddr; 483 __be32 saddr;
426 u8 tos; 484 u8 tos;
427 struct net *net; 485 struct net *net;
@@ -506,9 +564,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
506 struct net_device *dev = NULL; 564 struct net_device *dev = NULL;
507 565
508 rcu_read_lock(); 566 rcu_read_lock();
509 if (rt->fl.iif && 567 if (rt_is_input_route(rt) &&
510 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 568 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
511 dev = dev_get_by_index_rcu(net, rt->fl.iif); 569 dev = dev_get_by_index_rcu(net, rt->rt_iif);
512 570
513 if (dev) 571 if (dev)
514 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 572 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -521,7 +579,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
521 IPTOS_PREC_INTERNETCONTROL) : 579 IPTOS_PREC_INTERNETCONTROL) :
522 iph->tos; 580 iph->tos;
523 581
524 if (ip_options_echo(&icmp_param.replyopts, skb_in)) 582 if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
525 goto out_unlock; 583 goto out_unlock;
526 584
527 585
@@ -537,96 +595,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
537 icmp_param.offset = skb_network_offset(skb_in); 595 icmp_param.offset = skb_network_offset(skb_in);
538 inet_sk(sk)->tos = tos; 596 inet_sk(sk)->tos = tos;
539 ipc.addr = iph->saddr; 597 ipc.addr = iph->saddr;
540 ipc.opt = &icmp_param.replyopts; 598 ipc.opt = &icmp_param.replyopts.opt;
541 ipc.shtx.flags = 0; 599 ipc.tx_flags = 0;
542
543 {
544 struct flowi fl = {
545 .nl_u = {
546 .ip4_u = {
547 .daddr = icmp_param.replyopts.srr ?
548 icmp_param.replyopts.faddr :
549 iph->saddr,
550 .saddr = saddr,
551 .tos = RT_TOS(tos)
552 }
553 },
554 .proto = IPPROTO_ICMP,
555 .uli_u = {
556 .icmpt = {
557 .type = type,
558 .code = code
559 }
560 }
561 };
562 int err;
563 struct rtable *rt2;
564
565 security_skb_classify_flow(skb_in, &fl);
566 if (__ip_route_output_key(net, &rt, &fl))
567 goto out_unlock;
568
569 /* No need to clone since we're just using its address. */
570 rt2 = rt;
571
572 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
573 switch (err) {
574 case 0:
575 if (rt != rt2)
576 goto route_done;
577 break;
578 case -EPERM:
579 rt = NULL;
580 break;
581 default:
582 goto out_unlock;
583 }
584
585 if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
586 goto relookup_failed;
587
588 if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
589 err = __ip_route_output_key(net, &rt2, &fl);
590 else {
591 struct flowi fl2 = {};
592 unsigned long orefdst;
593 600
594 fl2.fl4_dst = fl.fl4_src; 601 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
595 if (ip_route_output_key(net, &rt2, &fl2)) 602 type, code, &icmp_param);
596 goto relookup_failed; 603 if (IS_ERR(rt))
597 604 goto out_unlock;
598 /* Ugh! */
599 orefdst = skb_in->_skb_refdst; /* save old refdst */
600 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
601 RT_TOS(tos), rt2->dst.dev);
602
603 dst_release(&rt2->dst);
604 rt2 = skb_rtable(skb_in);
605 skb_in->_skb_refdst = orefdst; /* restore old refdst */
606 }
607
608 if (err)
609 goto relookup_failed;
610
611 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
612 XFRM_LOOKUP_ICMP);
613 switch (err) {
614 case 0:
615 dst_release(&rt->dst);
616 rt = rt2;
617 break;
618 case -EPERM:
619 goto ende;
620 default:
621relookup_failed:
622 if (!rt)
623 goto out_unlock;
624 break;
625 }
626 }
627 605
628route_done: 606 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
629 if (!icmpv4_xrlim_allow(net, rt, type, code))
630 goto ende; 607 goto ende;
631 608
632 /* RFC says return as much as we can without exceeding 576 bytes. */ 609 /* RFC says return as much as we can without exceeding 576 bytes. */
@@ -634,7 +611,7 @@ route_done:
634 room = dst_mtu(&rt->dst); 611 room = dst_mtu(&rt->dst);
635 if (room > 576) 612 if (room > 576)
636 room = 576; 613 room = 576;
637 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; 614 room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
638 room -= sizeof(struct icmphdr); 615 room -= sizeof(struct icmphdr);
639 616
640 icmp_param.data_len = skb_in->len - icmp_param.offset; 617 icmp_param.data_len = skb_in->len - icmp_param.offset;
@@ -642,7 +619,7 @@ route_done:
642 icmp_param.data_len = room; 619 icmp_param.data_len = room;
643 icmp_param.head_len = sizeof(struct icmphdr); 620 icmp_param.head_len = sizeof(struct icmphdr);
644 621
645 icmp_push_reply(&icmp_param, &ipc, &rt); 622 icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
646ende: 623ende:
647 ip_rt_put(rt); 624 ip_rt_put(rt);
648out_unlock: 625out_unlock:
@@ -658,7 +635,7 @@ EXPORT_SYMBOL(icmp_send);
658 635
659static void icmp_unreach(struct sk_buff *skb) 636static void icmp_unreach(struct sk_buff *skb)
660{ 637{
661 struct iphdr *iph; 638 const struct iphdr *iph;
662 struct icmphdr *icmph; 639 struct icmphdr *icmph;
663 int hash, protocol; 640 int hash, protocol;
664 const struct net_protocol *ipprot; 641 const struct net_protocol *ipprot;
@@ -677,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb)
677 goto out_err; 654 goto out_err;
678 655
679 icmph = icmp_hdr(skb); 656 icmph = icmp_hdr(skb);
680 iph = (struct iphdr *)skb->data; 657 iph = (const struct iphdr *)skb->data;
681 658
682 if (iph->ihl < 5) /* Mangled header, drop. */ 659 if (iph->ihl < 5) /* Mangled header, drop. */
683 goto out_err; 660 goto out_err;
@@ -725,7 +702,7 @@ static void icmp_unreach(struct sk_buff *skb)
725 */ 702 */
726 703
727 /* 704 /*
728 * Check the other end isnt violating RFC 1122. Some routers send 705 * Check the other end isn't violating RFC 1122. Some routers send
729 * bogus responses to broadcast frames. If you see this message 706 * bogus responses to broadcast frames. If you see this message
730 * first check your netmask matches at both ends, if it does then 707 * first check your netmask matches at both ends, if it does then
731 * get the other vendor to fix their kit. 708 * get the other vendor to fix their kit.
@@ -750,7 +727,7 @@ static void icmp_unreach(struct sk_buff *skb)
750 if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) 727 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
751 goto out; 728 goto out;
752 729
753 iph = (struct iphdr *)skb->data; 730 iph = (const struct iphdr *)skb->data;
754 protocol = iph->protocol; 731 protocol = iph->protocol;
755 732
756 /* 733 /*
@@ -779,7 +756,7 @@ out_err:
779 756
780static void icmp_redirect(struct sk_buff *skb) 757static void icmp_redirect(struct sk_buff *skb)
781{ 758{
782 struct iphdr *iph; 759 const struct iphdr *iph;
783 760
784 if (skb->len < sizeof(struct iphdr)) 761 if (skb->len < sizeof(struct iphdr))
785 goto out_err; 762 goto out_err;
@@ -790,7 +767,7 @@ static void icmp_redirect(struct sk_buff *skb)
790 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 767 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
791 goto out; 768 goto out;
792 769
793 iph = (struct iphdr *)skb->data; 770 iph = (const struct iphdr *)skb->data;
794 771
795 switch (icmp_hdr(skb)->code & 7) { 772 switch (icmp_hdr(skb)->code & 7) {
796 case ICMP_REDIR_NET: 773 case ICMP_REDIR_NET:
@@ -805,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb)
805 iph->saddr, skb->dev); 782 iph->saddr, skb->dev);
806 break; 783 break;
807 } 784 }
785
786 /* Ping wants to see redirects.
787 * Let's pretend they are errors of sorts... */
788 if (iph->protocol == IPPROTO_ICMP &&
789 iph->ihl >= 5 &&
790 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
791 ping_err(skb, icmp_hdr(skb)->un.gateway);
792 }
793
808out: 794out:
809 return; 795 return;
810out_err: 796out_err:
@@ -954,12 +940,12 @@ static void icmp_address_reply(struct sk_buff *skb)
954 BUG_ON(mp == NULL); 940 BUG_ON(mp == NULL);
955 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { 941 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
956 if (*mp == ifa->ifa_mask && 942 if (*mp == ifa->ifa_mask &&
957 inet_ifa_match(rt->rt_src, ifa)) 943 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
958 break; 944 break;
959 } 945 }
960 if (!ifa && net_ratelimit()) { 946 if (!ifa && net_ratelimit()) {
961 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", 947 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
962 mp, dev->name, &rt->rt_src); 948 mp, dev->name, &ip_hdr(skb)->saddr);
963 } 949 }
964 } 950 }
965} 951}
@@ -1065,7 +1051,7 @@ error:
1065 */ 1051 */
1066static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { 1052static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1067 [ICMP_ECHOREPLY] = { 1053 [ICMP_ECHOREPLY] = {
1068 .handler = icmp_discard, 1054 .handler = ping_rcv,
1069 }, 1055 },
1070 [1] = { 1056 [1] = {
1071 .handler = icmp_discard, 1057 .handler = icmp_discard,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2a4bb76f2132..f1d27f6c9351 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -153,17 +153,27 @@ static void ip_ma_put(struct ip_mc_list *im)
153{ 153{
154 if (atomic_dec_and_test(&im->refcnt)) { 154 if (atomic_dec_and_test(&im->refcnt)) {
155 in_dev_put(im->interface); 155 in_dev_put(im->interface);
156 kfree(im); 156 kfree_rcu(im, rcu);
157 } 157 }
158} 158}
159 159
160#define for_each_pmc_rcu(in_dev, pmc) \
161 for (pmc = rcu_dereference(in_dev->mc_list); \
162 pmc != NULL; \
163 pmc = rcu_dereference(pmc->next_rcu))
164
165#define for_each_pmc_rtnl(in_dev, pmc) \
166 for (pmc = rtnl_dereference(in_dev->mc_list); \
167 pmc != NULL; \
168 pmc = rtnl_dereference(pmc->next_rcu))
169
160#ifdef CONFIG_IP_MULTICAST 170#ifdef CONFIG_IP_MULTICAST
161 171
162/* 172/*
163 * Timer management 173 * Timer management
164 */ 174 */
165 175
166static __inline__ void igmp_stop_timer(struct ip_mc_list *im) 176static void igmp_stop_timer(struct ip_mc_list *im)
167{ 177{
168 spin_lock_bh(&im->lock); 178 spin_lock_bh(&im->lock);
169 if (del_timer(&im->timer)) 179 if (del_timer(&im->timer))
@@ -284,6 +294,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
284 return scount; 294 return scount;
285} 295}
286 296
297#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
298
287static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) 299static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
288{ 300{
289 struct sk_buff *skb; 301 struct sk_buff *skb;
@@ -291,24 +303,24 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
291 struct iphdr *pip; 303 struct iphdr *pip;
292 struct igmpv3_report *pig; 304 struct igmpv3_report *pig;
293 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
306 struct flowi4 fl4;
294 307
295 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 308 while (1) {
296 if (skb == NULL) 309 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
297 return NULL; 310 GFP_ATOMIC | __GFP_NOWARN);
298 311 if (skb)
299 { 312 break;
300 struct flowi fl = { .oif = dev->ifindex, 313 size >>= 1;
301 .nl_u = { .ip4_u = { 314 if (size < 256)
302 .daddr = IGMPV3_ALL_MCR } },
303 .proto = IPPROTO_IGMP };
304 if (ip_route_output_key(net, &rt, &fl)) {
305 kfree_skb(skb);
306 return NULL; 315 return NULL;
307 }
308 } 316 }
309 if (rt->rt_src == 0) { 317 igmp_skb_size(skb) = size;
318
319 rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
320 0, 0,
321 IPPROTO_IGMP, 0, dev->ifindex);
322 if (IS_ERR(rt)) {
310 kfree_skb(skb); 323 kfree_skb(skb);
311 ip_rt_put(rt);
312 return NULL; 324 return NULL;
313 } 325 }
314 326
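Editor's note on the hunk above: igmpv3_newpack() now retries the allocation with a halved size (down to a floor of 256 bytes) instead of failing outright, and records the size it actually obtained in skb->cb via igmp_skb_size(), so that AVAILABLE() later measures room against the granted size rather than dev->mtu. A userspace sketch of that shrink-on-failure pattern, with assumed sizes only:

	#include <stdio.h>
	#include <stdlib.h>

	/* Halve the request until it succeeds or drops below 'floor';
	 * report the size actually granted so the caller can budget writes. */
	static void *alloc_shrinking(size_t size, size_t floor, size_t *granted)
	{
		void *buf;

		while (1) {
			buf = malloc(size);
			if (buf)
				break;
			size >>= 1;
			if (size < floor)
				return NULL;
		}
		*granted = size;
		return buf;
	}

	int main(void)
	{
		size_t got;
		void *p = alloc_shrinking(64 * 1024, 256, &got);

		if (p) {
			printf("got %zu bytes\n", got);
			free(p);
		}
		return 0;
	}

The key point mirrored from the kernel change is that callers must not assume they received the size they asked for, which is exactly why the granted size is stashed in the skb control buffer for add_grec() to consult.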
@@ -326,8 +338,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
326 pip->tos = 0xc0; 338 pip->tos = 0xc0;
327 pip->frag_off = htons(IP_DF); 339 pip->frag_off = htons(IP_DF);
328 pip->ttl = 1; 340 pip->ttl = 1;
329 pip->daddr = rt->rt_dst; 341 pip->daddr = fl4.daddr;
330 pip->saddr = rt->rt_src; 342 pip->saddr = fl4.saddr;
331 pip->protocol = IPPROTO_IGMP; 343 pip->protocol = IPPROTO_IGMP;
332 pip->tot_len = 0; /* filled in later */ 344 pip->tot_len = 0; /* filled in later */
333 ip_select_ident(pip, &rt->dst, NULL); 345 ip_select_ident(pip, &rt->dst, NULL);
@@ -384,7 +396,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
384 return skb; 396 return skb;
385} 397}
386 398
387#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ 399#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
388 skb_tailroom(skb)) : 0) 400 skb_tailroom(skb)) : 0)
389 401
390static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, 402static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -502,8 +514,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
502 int type; 514 int type;
503 515
504 if (!pmc) { 516 if (!pmc) {
505 read_lock(&in_dev->mc_list_lock); 517 rcu_read_lock();
506 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 518 for_each_pmc_rcu(in_dev, pmc) {
507 if (pmc->multiaddr == IGMP_ALL_HOSTS) 519 if (pmc->multiaddr == IGMP_ALL_HOSTS)
508 continue; 520 continue;
509 spin_lock_bh(&pmc->lock); 521 spin_lock_bh(&pmc->lock);
@@ -514,7 +526,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
514 skb = add_grec(skb, pmc, type, 0, 0); 526 skb = add_grec(skb, pmc, type, 0, 0);
515 spin_unlock_bh(&pmc->lock); 527 spin_unlock_bh(&pmc->lock);
516 } 528 }
517 read_unlock(&in_dev->mc_list_lock); 529 rcu_read_unlock();
518 } else { 530 } else {
519 spin_lock_bh(&pmc->lock); 531 spin_lock_bh(&pmc->lock);
520 if (pmc->sfcount[MCAST_EXCLUDE]) 532 if (pmc->sfcount[MCAST_EXCLUDE])
@@ -556,7 +568,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
556 struct sk_buff *skb = NULL; 568 struct sk_buff *skb = NULL;
557 int type, dtype; 569 int type, dtype;
558 570
559 read_lock(&in_dev->mc_list_lock); 571 rcu_read_lock();
560 spin_lock_bh(&in_dev->mc_tomb_lock); 572 spin_lock_bh(&in_dev->mc_tomb_lock);
561 573
562 /* deleted MCA's */ 574 /* deleted MCA's */
@@ -593,7 +605,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
593 spin_unlock_bh(&in_dev->mc_tomb_lock); 605 spin_unlock_bh(&in_dev->mc_tomb_lock);
594 606
595 /* change recs */ 607 /* change recs */
596 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 608 for_each_pmc_rcu(in_dev, pmc) {
597 spin_lock_bh(&pmc->lock); 609 spin_lock_bh(&pmc->lock);
598 if (pmc->sfcount[MCAST_EXCLUDE]) { 610 if (pmc->sfcount[MCAST_EXCLUDE]) {
599 type = IGMPV3_BLOCK_OLD_SOURCES; 611 type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -616,7 +628,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
616 } 628 }
617 spin_unlock_bh(&pmc->lock); 629 spin_unlock_bh(&pmc->lock);
618 } 630 }
619 read_unlock(&in_dev->mc_list_lock); 631 rcu_read_unlock();
620 632
621 if (!skb) 633 if (!skb)
622 return; 634 return;
@@ -633,6 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
633 struct net_device *dev = in_dev->dev; 645 struct net_device *dev = in_dev->dev;
634 struct net *net = dev_net(dev); 646 struct net *net = dev_net(dev);
635 __be32 group = pmc ? pmc->multiaddr : 0; 647 __be32 group = pmc ? pmc->multiaddr : 0;
648 struct flowi4 fl4;
636 __be32 dst; 649 __be32 dst;
637 650
638 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) 651 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
@@ -642,17 +655,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
642 else 655 else
643 dst = group; 656 dst = group;
644 657
645 { 658 rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
646 struct flowi fl = { .oif = dev->ifindex, 659 0, 0,
647 .nl_u = { .ip4_u = { .daddr = dst } }, 660 IPPROTO_IGMP, 0, dev->ifindex);
648 .proto = IPPROTO_IGMP }; 661 if (IS_ERR(rt))
649 if (ip_route_output_key(net, &rt, &fl))
650 return -1;
651 }
652 if (rt->rt_src == 0) {
653 ip_rt_put(rt);
654 return -1; 662 return -1;
655 }
656 663
657 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 664 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
658 if (skb == NULL) { 665 if (skb == NULL) {
@@ -674,7 +681,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
674 iph->frag_off = htons(IP_DF); 681 iph->frag_off = htons(IP_DF);
675 iph->ttl = 1; 682 iph->ttl = 1;
676 iph->daddr = dst; 683 iph->daddr = dst;
677 iph->saddr = rt->rt_src; 684 iph->saddr = fl4.saddr;
678 iph->protocol = IPPROTO_IGMP; 685 iph->protocol = IPPROTO_IGMP;
679 ip_select_ident(iph, &rt->dst, NULL); 686 ip_select_ident(iph, &rt->dst, NULL);
680 ((u8*)&iph[1])[0] = IPOPT_RA; 687 ((u8*)&iph[1])[0] = IPOPT_RA;
@@ -813,14 +820,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
813 if (group == IGMP_ALL_HOSTS) 820 if (group == IGMP_ALL_HOSTS)
814 return; 821 return;
815 822
816 read_lock(&in_dev->mc_list_lock); 823 rcu_read_lock();
817 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 824 for_each_pmc_rcu(in_dev, im) {
818 if (im->multiaddr == group) { 825 if (im->multiaddr == group) {
819 igmp_stop_timer(im); 826 igmp_stop_timer(im);
820 break; 827 break;
821 } 828 }
822 } 829 }
823 read_unlock(&in_dev->mc_list_lock); 830 rcu_read_unlock();
824} 831}
825 832
826static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, 833static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -906,8 +913,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
906 * - Use the igmp->igmp_code field as the maximum 913 * - Use the igmp->igmp_code field as the maximum
907 * delay possible 914 * delay possible
908 */ 915 */
909 read_lock(&in_dev->mc_list_lock); 916 rcu_read_lock();
910 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 917 for_each_pmc_rcu(in_dev, im) {
911 int changed; 918 int changed;
912 919
913 if (group && group != im->multiaddr) 920 if (group && group != im->multiaddr)
@@ -925,7 +932,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
925 if (changed) 932 if (changed)
926 igmp_mod_timer(im, max_delay); 933 igmp_mod_timer(im, max_delay);
927 } 934 }
928 read_unlock(&in_dev->mc_list_lock); 935 rcu_read_unlock();
929} 936}
930 937
931/* called in rcu_read_lock() section */ 938/* called in rcu_read_lock() section */
@@ -961,7 +968,7 @@ int igmp_rcv(struct sk_buff *skb)
961 case IGMP_HOST_MEMBERSHIP_REPORT: 968 case IGMP_HOST_MEMBERSHIP_REPORT:
962 case IGMPV2_HOST_MEMBERSHIP_REPORT: 969 case IGMPV2_HOST_MEMBERSHIP_REPORT:
963 /* Is it our report looped back? */ 970 /* Is it our report looped back? */
964 if (skb_rtable(skb)->fl.iif == 0) 971 if (rt_is_output_route(skb_rtable(skb)))
965 break; 972 break;
966 /* don't rely on MC router hearing unicast reports */ 973 /* don't rely on MC router hearing unicast reports */
967 if (skb->pkt_type == PACKET_MULTICAST || 974 if (skb->pkt_type == PACKET_MULTICAST ||
@@ -1110,8 +1117,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1110 kfree(pmc); 1117 kfree(pmc);
1111 } 1118 }
1112 /* clear dead sources, too */ 1119 /* clear dead sources, too */
1113 read_lock(&in_dev->mc_list_lock); 1120 rcu_read_lock();
1114 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1121 for_each_pmc_rcu(in_dev, pmc) {
1115 struct ip_sf_list *psf, *psf_next; 1122 struct ip_sf_list *psf, *psf_next;
1116 1123
1117 spin_lock_bh(&pmc->lock); 1124 spin_lock_bh(&pmc->lock);
@@ -1123,7 +1130,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1123 kfree(psf); 1130 kfree(psf);
1124 } 1131 }
1125 } 1132 }
1126 read_unlock(&in_dev->mc_list_lock); 1133 rcu_read_unlock();
1127} 1134}
1128#endif 1135#endif
1129 1136
@@ -1148,20 +1155,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
1148 1155
1149 if (!in_dev->dead) { 1156 if (!in_dev->dead) {
1150 if (IGMP_V1_SEEN(in_dev)) 1157 if (IGMP_V1_SEEN(in_dev))
1151 goto done; 1158 return;
1152 if (IGMP_V2_SEEN(in_dev)) { 1159 if (IGMP_V2_SEEN(in_dev)) {
1153 if (reporter) 1160 if (reporter)
1154 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); 1161 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
1155 goto done; 1162 return;
1156 } 1163 }
1157 /* IGMPv3 */ 1164 /* IGMPv3 */
1158 igmpv3_add_delrec(in_dev, im); 1165 igmpv3_add_delrec(in_dev, im);
1159 1166
1160 igmp_ifc_event(in_dev); 1167 igmp_ifc_event(in_dev);
1161 } 1168 }
1162done:
1163#endif 1169#endif
1164 ip_mc_clear_src(im);
1165} 1170}
1166 1171
1167static void igmp_group_added(struct ip_mc_list *im) 1172static void igmp_group_added(struct ip_mc_list *im)
@@ -1209,7 +1214,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1209 1214
1210 ASSERT_RTNL(); 1215 ASSERT_RTNL();
1211 1216
1212 for (im=in_dev->mc_list; im; im=im->next) { 1217 for_each_pmc_rtnl(in_dev, im) {
1213 if (im->multiaddr == addr) { 1218 if (im->multiaddr == addr) {
1214 im->users++; 1219 im->users++;
1215 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); 1220 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1217,7 +1222,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1217 } 1222 }
1218 } 1223 }
1219 1224
1220 im = kmalloc(sizeof(*im), GFP_KERNEL); 1225 im = kzalloc(sizeof(*im), GFP_KERNEL);
1221 if (!im) 1226 if (!im)
1222 goto out; 1227 goto out;
1223 1228
@@ -1227,26 +1232,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1227 im->multiaddr = addr; 1232 im->multiaddr = addr;
1228 /* initial mode is (EX, empty) */ 1233 /* initial mode is (EX, empty) */
1229 im->sfmode = MCAST_EXCLUDE; 1234 im->sfmode = MCAST_EXCLUDE;
1230 im->sfcount[MCAST_INCLUDE] = 0;
1231 im->sfcount[MCAST_EXCLUDE] = 1; 1235 im->sfcount[MCAST_EXCLUDE] = 1;
1232 im->sources = NULL;
1233 im->tomb = NULL;
1234 im->crcount = 0;
1235 atomic_set(&im->refcnt, 1); 1236 atomic_set(&im->refcnt, 1);
1236 spin_lock_init(&im->lock); 1237 spin_lock_init(&im->lock);
1237#ifdef CONFIG_IP_MULTICAST 1238#ifdef CONFIG_IP_MULTICAST
1238 im->tm_running = 0;
1239 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1239 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
1240 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1240 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1241 im->reporter = 0;
1242 im->gsquery = 0;
1243#endif 1241#endif
1244 im->loaded = 0; 1242
1245 write_lock_bh(&in_dev->mc_list_lock); 1243 im->next_rcu = in_dev->mc_list;
1246 im->next = in_dev->mc_list;
1247 in_dev->mc_list = im;
1248 in_dev->mc_count++; 1244 in_dev->mc_count++;
1249 write_unlock_bh(&in_dev->mc_list_lock); 1245 rcu_assign_pointer(in_dev->mc_list, im);
1246
1250#ifdef CONFIG_IP_MULTICAST 1247#ifdef CONFIG_IP_MULTICAST
1251 igmpv3_del_delrec(in_dev, im->multiaddr); 1248 igmpv3_del_delrec(in_dev, im->multiaddr);
1252#endif 1249#endif
@@ -1260,26 +1257,32 @@ EXPORT_SYMBOL(ip_mc_inc_group);
1260 1257
1261/* 1258/*
1262 * Resend IGMP JOIN report; used for bonding. 1259 * Resend IGMP JOIN report; used for bonding.
1260 * Called with rcu_read_lock()
1263 */ 1261 */
1264void ip_mc_rejoin_group(struct ip_mc_list *im) 1262void ip_mc_rejoin_groups(struct in_device *in_dev)
1265{ 1263{
1266#ifdef CONFIG_IP_MULTICAST 1264#ifdef CONFIG_IP_MULTICAST
1267 struct in_device *in_dev = im->interface; 1265 struct ip_mc_list *im;
1266 int type;
1268 1267
1269 if (im->multiaddr == IGMP_ALL_HOSTS) 1268 for_each_pmc_rcu(in_dev, im) {
1270 return; 1269 if (im->multiaddr == IGMP_ALL_HOSTS)
1270 continue;
1271 1271
1272 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { 1272 /* a failover is happening and switches
1273 igmp_mod_timer(im, IGMP_Initial_Report_Delay); 1273 * must be notified immediately
1274 return; 1274 */
1275 if (IGMP_V1_SEEN(in_dev))
1276 type = IGMP_HOST_MEMBERSHIP_REPORT;
1277 else if (IGMP_V2_SEEN(in_dev))
1278 type = IGMPV2_HOST_MEMBERSHIP_REPORT;
1279 else
1280 type = IGMPV3_HOST_MEMBERSHIP_REPORT;
1281 igmp_send_report(in_dev, im, type);
1275 } 1282 }
1276 /* else, v3 */
1277 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1278 IGMP_Unsolicited_Report_Count;
1279 igmp_ifc_event(in_dev);
1280#endif 1283#endif
1281} 1284}
1282EXPORT_SYMBOL(ip_mc_rejoin_group); 1285EXPORT_SYMBOL(ip_mc_rejoin_groups);
1283 1286
1284/* 1287/*
1285 * A socket has left a multicast group on device dev 1288 * A socket has left a multicast group on device dev
@@ -1287,18 +1290,20 @@ EXPORT_SYMBOL(ip_mc_rejoin_group);
1287 1290
1288void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) 1291void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1289{ 1292{
1290 struct ip_mc_list *i, **ip; 1293 struct ip_mc_list *i;
1294 struct ip_mc_list __rcu **ip;
1291 1295
1292 ASSERT_RTNL(); 1296 ASSERT_RTNL();
1293 1297
1294 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { 1298 for (ip = &in_dev->mc_list;
1299 (i = rtnl_dereference(*ip)) != NULL;
1300 ip = &i->next_rcu) {
1295 if (i->multiaddr == addr) { 1301 if (i->multiaddr == addr) {
1296 if (--i->users == 0) { 1302 if (--i->users == 0) {
1297 write_lock_bh(&in_dev->mc_list_lock); 1303 *ip = i->next_rcu;
1298 *ip = i->next;
1299 in_dev->mc_count--; 1304 in_dev->mc_count--;
1300 write_unlock_bh(&in_dev->mc_list_lock);
1301 igmp_group_dropped(i); 1305 igmp_group_dropped(i);
1306 ip_mc_clear_src(i);
1302 1307
1303 if (!in_dev->dead) 1308 if (!in_dev->dead)
1304 ip_rt_multicast_event(in_dev); 1309 ip_rt_multicast_event(in_dev);
@@ -1316,34 +1321,34 @@ EXPORT_SYMBOL(ip_mc_dec_group);
1316 1321
1317void ip_mc_unmap(struct in_device *in_dev) 1322void ip_mc_unmap(struct in_device *in_dev)
1318{ 1323{
1319 struct ip_mc_list *i; 1324 struct ip_mc_list *pmc;
1320 1325
1321 ASSERT_RTNL(); 1326 ASSERT_RTNL();
1322 1327
1323 for (i = in_dev->mc_list; i; i = i->next) 1328 for_each_pmc_rtnl(in_dev, pmc)
1324 igmp_group_dropped(i); 1329 igmp_group_dropped(pmc);
1325} 1330}
1326 1331
1327void ip_mc_remap(struct in_device *in_dev) 1332void ip_mc_remap(struct in_device *in_dev)
1328{ 1333{
1329 struct ip_mc_list *i; 1334 struct ip_mc_list *pmc;
1330 1335
1331 ASSERT_RTNL(); 1336 ASSERT_RTNL();
1332 1337
1333 for (i = in_dev->mc_list; i; i = i->next) 1338 for_each_pmc_rtnl(in_dev, pmc)
1334 igmp_group_added(i); 1339 igmp_group_added(pmc);
1335} 1340}
1336 1341
1337/* Device going down */ 1342/* Device going down */
1338 1343
1339void ip_mc_down(struct in_device *in_dev) 1344void ip_mc_down(struct in_device *in_dev)
1340{ 1345{
1341 struct ip_mc_list *i; 1346 struct ip_mc_list *pmc;
1342 1347
1343 ASSERT_RTNL(); 1348 ASSERT_RTNL();
1344 1349
1345 for (i=in_dev->mc_list; i; i=i->next) 1350 for_each_pmc_rtnl(in_dev, pmc)
1346 igmp_group_dropped(i); 1351 igmp_group_dropped(pmc);
1347 1352
1348#ifdef CONFIG_IP_MULTICAST 1353#ifdef CONFIG_IP_MULTICAST
1349 in_dev->mr_ifc_count = 0; 1354 in_dev->mr_ifc_count = 0;
@@ -1374,7 +1379,6 @@ void ip_mc_init_dev(struct in_device *in_dev)
1374 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; 1379 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1375#endif 1380#endif
1376 1381
1377 rwlock_init(&in_dev->mc_list_lock);
1378 spin_lock_init(&in_dev->mc_tomb_lock); 1382 spin_lock_init(&in_dev->mc_tomb_lock);
1379} 1383}
1380 1384
@@ -1382,14 +1386,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
1382 1386
1383void ip_mc_up(struct in_device *in_dev) 1387void ip_mc_up(struct in_device *in_dev)
1384{ 1388{
1385 struct ip_mc_list *i; 1389 struct ip_mc_list *pmc;
1386 1390
1387 ASSERT_RTNL(); 1391 ASSERT_RTNL();
1388 1392
1389 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); 1393 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1390 1394
1391 for (i=in_dev->mc_list; i; i=i->next) 1395 for_each_pmc_rtnl(in_dev, pmc)
1392 igmp_group_added(i); 1396 igmp_group_added(pmc);
1393} 1397}
1394 1398
1395/* 1399/*
@@ -1405,43 +1409,40 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1405 /* Deactivate timers */ 1409 /* Deactivate timers */
1406 ip_mc_down(in_dev); 1410 ip_mc_down(in_dev);
1407 1411
1408 write_lock_bh(&in_dev->mc_list_lock); 1412 while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
1409 while ((i = in_dev->mc_list) != NULL) { 1413 in_dev->mc_list = i->next_rcu;
1410 in_dev->mc_list = i->next;
1411 in_dev->mc_count--; 1414 in_dev->mc_count--;
1412 write_unlock_bh(&in_dev->mc_list_lock);
1413 igmp_group_dropped(i);
1414 ip_ma_put(i);
1415 1415
1416 write_lock_bh(&in_dev->mc_list_lock); 1416 /* We've dropped the groups in ip_mc_down already */
1417 ip_mc_clear_src(i);
1418 ip_ma_put(i);
1417 } 1419 }
1418 write_unlock_bh(&in_dev->mc_list_lock);
1419} 1420}
1420 1421
1422/* RTNL is locked */
1421static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1423static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1422{ 1424{
1423 struct flowi fl = { .nl_u = { .ip4_u =
1424 { .daddr = imr->imr_multiaddr.s_addr } } };
1425 struct rtable *rt;
1426 struct net_device *dev = NULL; 1425 struct net_device *dev = NULL;
1427 struct in_device *idev = NULL; 1426 struct in_device *idev = NULL;
1428 1427
1429 if (imr->imr_ifindex) { 1428 if (imr->imr_ifindex) {
1430 idev = inetdev_by_index(net, imr->imr_ifindex); 1429 idev = inetdev_by_index(net, imr->imr_ifindex);
1431 if (idev)
1432 __in_dev_put(idev);
1433 return idev; 1430 return idev;
1434 } 1431 }
1435 if (imr->imr_address.s_addr) { 1432 if (imr->imr_address.s_addr) {
1436 dev = ip_dev_find(net, imr->imr_address.s_addr); 1433 dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
1437 if (!dev) 1434 if (!dev)
1438 return NULL; 1435 return NULL;
1439 dev_put(dev);
1440 } 1436 }
1441 1437
1442 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1438 if (!dev) {
1443 dev = rt->dst.dev; 1439 struct rtable *rt = ip_route_output(net,
1444 ip_rt_put(rt); 1440 imr->imr_multiaddr.s_addr,
1441 0, 0, 0);
1442 if (!IS_ERR(rt)) {
1443 dev = rt->dst.dev;
1444 ip_rt_put(rt);
1445 }
1445 } 1446 }
1446 if (dev) { 1447 if (dev) {
1447 imr->imr_ifindex = dev->ifindex; 1448 imr->imr_ifindex = dev->ifindex;
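Note: the rewritten ip_mc_find_dev() above shows the routing-API convention used throughout this series: ip_route_output() returns the rtable itself (or an ERR_PTR-encoded error) instead of filling a pointer argument, so callers test IS_ERR() rather than a return code. A minimal sketch of that calling convention; mcast_route_dev() is an illustrative name, not a function in the tree:

	/* Sketch only: resolve the egress device for a multicast group when the
	 * caller supplied neither an ifindex nor a local address. */
	static struct net_device *mcast_route_dev(struct net *net, __be32 group)
	{
		struct rtable *rt = ip_route_output(net, group, 0, 0, 0);
		struct net_device *dev = NULL;

		if (!IS_ERR(rt)) {
			dev = rt->dst.dev;	/* device the resolved route uses */
			ip_rt_put(rt);		/* drop the route reference */
		}
		return dev;
	}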
@@ -1515,18 +1516,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1515 1516
1516 if (!in_dev) 1517 if (!in_dev)
1517 return -ENODEV; 1518 return -ENODEV;
1518 read_lock(&in_dev->mc_list_lock); 1519 rcu_read_lock();
1519 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1520 for_each_pmc_rcu(in_dev, pmc) {
1520 if (*pmca == pmc->multiaddr) 1521 if (*pmca == pmc->multiaddr)
1521 break; 1522 break;
1522 } 1523 }
1523 if (!pmc) { 1524 if (!pmc) {
1524 /* MCA not found?? bug */ 1525 /* MCA not found?? bug */
1525 read_unlock(&in_dev->mc_list_lock); 1526 rcu_read_unlock();
1526 return -ESRCH; 1527 return -ESRCH;
1527 } 1528 }
1528 spin_lock_bh(&pmc->lock); 1529 spin_lock_bh(&pmc->lock);
1529 read_unlock(&in_dev->mc_list_lock); 1530 rcu_read_unlock();
1530#ifdef CONFIG_IP_MULTICAST 1531#ifdef CONFIG_IP_MULTICAST
1531 sf_markstate(pmc); 1532 sf_markstate(pmc);
1532#endif 1533#endif
@@ -1687,18 +1688,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1687 1688
1688 if (!in_dev) 1689 if (!in_dev)
1689 return -ENODEV; 1690 return -ENODEV;
1690 read_lock(&in_dev->mc_list_lock); 1691 rcu_read_lock();
1691 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1692 for_each_pmc_rcu(in_dev, pmc) {
1692 if (*pmca == pmc->multiaddr) 1693 if (*pmca == pmc->multiaddr)
1693 break; 1694 break;
1694 } 1695 }
1695 if (!pmc) { 1696 if (!pmc) {
1696 /* MCA not found?? bug */ 1697 /* MCA not found?? bug */
1697 read_unlock(&in_dev->mc_list_lock); 1698 rcu_read_unlock();
1698 return -ESRCH; 1699 return -ESRCH;
1699 } 1700 }
1700 spin_lock_bh(&pmc->lock); 1701 spin_lock_bh(&pmc->lock);
1701 read_unlock(&in_dev->mc_list_lock); 1702 rcu_read_unlock();
1702 1703
1703#ifdef CONFIG_IP_MULTICAST 1704#ifdef CONFIG_IP_MULTICAST
1704 sf_markstate(pmc); 1705 sf_markstate(pmc);
@@ -1795,7 +1796,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1795 1796
1796 err = -EADDRINUSE; 1797 err = -EADDRINUSE;
1797 ifindex = imr->imr_ifindex; 1798 ifindex = imr->imr_ifindex;
1798 for (i = inet->mc_list; i; i = i->next) { 1799 for_each_pmc_rtnl(inet, i) {
1799 if (i->multi.imr_multiaddr.s_addr == addr && 1800 if (i->multi.imr_multiaddr.s_addr == addr &&
1800 i->multi.imr_ifindex == ifindex) 1801 i->multi.imr_ifindex == ifindex)
1801 goto done; 1802 goto done;
@@ -1809,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1809 goto done; 1810 goto done;
1810 1811
1811 memcpy(&iml->multi, imr, sizeof(*imr)); 1812 memcpy(&iml->multi, imr, sizeof(*imr));
1812 iml->next = inet->mc_list; 1813 iml->next_rcu = inet->mc_list;
1813 iml->sflist = NULL; 1814 iml->sflist = NULL;
1814 iml->sfmode = MCAST_EXCLUDE; 1815 iml->sfmode = MCAST_EXCLUDE;
1815 rcu_assign_pointer(inet->mc_list, iml); 1816 rcu_assign_pointer(inet->mc_list, iml);
@@ -1821,19 +1822,10 @@ done:
1821} 1822}
1822EXPORT_SYMBOL(ip_mc_join_group); 1823EXPORT_SYMBOL(ip_mc_join_group);
1823 1824
1824static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1825{
1826 struct ip_sf_socklist *psf;
1827
1828 psf = container_of(rp, struct ip_sf_socklist, rcu);
1829 /* sk_omem_alloc should have been decreased by the caller*/
1830 kfree(psf);
1831}
1832
1833static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1825static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1834 struct in_device *in_dev) 1826 struct in_device *in_dev)
1835{ 1827{
1836 struct ip_sf_socklist *psf = iml->sflist; 1828 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
1837 int err; 1829 int err;
1838 1830
1839 if (psf == NULL) { 1831 if (psf == NULL) {
@@ -1846,21 +1838,10 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1846 rcu_assign_pointer(iml->sflist, NULL); 1838 rcu_assign_pointer(iml->sflist, NULL);
1847 /* decrease mem now to avoid the memleak warning */ 1839 /* decrease mem now to avoid the memleak warning */
1848 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); 1840 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1849 call_rcu(&psf->rcu, ip_sf_socklist_reclaim); 1841 kfree_rcu(psf, rcu);
1850 return err; 1842 return err;
1851} 1843}
1852 1844
1853
1854static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1855{
1856 struct ip_mc_socklist *iml;
1857
1858 iml = container_of(rp, struct ip_mc_socklist, rcu);
1859 /* sk_omem_alloc should have been decreased by the caller*/
1860 kfree(iml);
1861}
1862
1863
1864/* 1845/*
1865 * Ask a socket to leave a group. 1846 * Ask a socket to leave a group.
1866 */ 1847 */
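Note: the two reclaim callbacks deleted above existed only to kfree() their container after a grace period; kfree_rcu() does the same without a dedicated callback, provided the structure embeds a struct rcu_head. A small before/after sketch, with example_obj standing in for ip_sf_socklist / ip_mc_socklist:

	struct example_obj {		/* stand-in for ip_{sf,mc}_socklist */
		int payload;
		struct rcu_head rcu;	/* kfree_rcu() needs an embedded rcu_head */
	};

	static void drop_obj(struct example_obj *obj)
	{
		/* before: call_rcu(&obj->rcu, example_obj_reclaim), where the
		 * callback only called kfree(); after: */
		kfree_rcu(obj, rcu);	/* frees obj once a grace period elapses */
	}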
@@ -1868,7 +1849,8 @@ static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1868int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) 1849int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1869{ 1850{
1870 struct inet_sock *inet = inet_sk(sk); 1851 struct inet_sock *inet = inet_sk(sk);
1871 struct ip_mc_socklist *iml, **imlp; 1852 struct ip_mc_socklist *iml;
1853 struct ip_mc_socklist __rcu **imlp;
1872 struct in_device *in_dev; 1854 struct in_device *in_dev;
1873 struct net *net = sock_net(sk); 1855 struct net *net = sock_net(sk);
1874 __be32 group = imr->imr_multiaddr.s_addr; 1856 __be32 group = imr->imr_multiaddr.s_addr;
@@ -1878,7 +1860,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1878 rtnl_lock(); 1860 rtnl_lock();
1879 in_dev = ip_mc_find_dev(net, imr); 1861 in_dev = ip_mc_find_dev(net, imr);
1880 ifindex = imr->imr_ifindex; 1862 ifindex = imr->imr_ifindex;
1881 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1863 for (imlp = &inet->mc_list;
1864 (iml = rtnl_dereference(*imlp)) != NULL;
1865 imlp = &iml->next_rcu) {
1882 if (iml->multi.imr_multiaddr.s_addr != group) 1866 if (iml->multi.imr_multiaddr.s_addr != group)
1883 continue; 1867 continue;
1884 if (ifindex) { 1868 if (ifindex) {
@@ -1890,14 +1874,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1890 1874
1891 (void) ip_mc_leave_src(sk, iml, in_dev); 1875 (void) ip_mc_leave_src(sk, iml, in_dev);
1892 1876
1893 rcu_assign_pointer(*imlp, iml->next); 1877 *imlp = iml->next_rcu;
1894 1878
1895 if (in_dev) 1879 if (in_dev)
1896 ip_mc_dec_group(in_dev, group); 1880 ip_mc_dec_group(in_dev, group);
1897 rtnl_unlock(); 1881 rtnl_unlock();
1898 /* decrease mem now to avoid the memleak warning */ 1882 /* decrease mem now to avoid the memleak warning */
1899 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 1883 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1900 call_rcu(&iml->rcu, ip_mc_socklist_reclaim); 1884 kfree_rcu(iml, rcu);
1901 return 0; 1885 return 0;
1902 } 1886 }
1903 if (!in_dev) 1887 if (!in_dev)
@@ -1936,7 +1920,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1936 } 1920 }
1937 err = -EADDRNOTAVAIL; 1921 err = -EADDRNOTAVAIL;
1938 1922
1939 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1923 for_each_pmc_rtnl(inet, pmc) {
1940 if ((pmc->multi.imr_multiaddr.s_addr == 1924 if ((pmc->multi.imr_multiaddr.s_addr ==
1941 imr.imr_multiaddr.s_addr) && 1925 imr.imr_multiaddr.s_addr) &&
1942 (pmc->multi.imr_ifindex == imr.imr_ifindex)) 1926 (pmc->multi.imr_ifindex == imr.imr_ifindex))
@@ -1960,7 +1944,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1960 pmc->sfmode = omode; 1944 pmc->sfmode = omode;
1961 } 1945 }
1962 1946
1963 psl = pmc->sflist; 1947 psl = rtnl_dereference(pmc->sflist);
1964 if (!add) { 1948 if (!add) {
1965 if (!psl) 1949 if (!psl)
1966 goto done; /* err = -EADDRNOTAVAIL */ 1950 goto done; /* err = -EADDRNOTAVAIL */
@@ -2014,7 +1998,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
2014 newpsl->sl_addr[i] = psl->sl_addr[i]; 1998 newpsl->sl_addr[i] = psl->sl_addr[i];
2015 /* decrease mem now to avoid the memleak warning */ 1999 /* decrease mem now to avoid the memleak warning */
2016 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); 2000 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2017 call_rcu(&psl->rcu, ip_sf_socklist_reclaim); 2001 kfree_rcu(psl, rcu);
2018 } 2002 }
2019 rcu_assign_pointer(pmc->sflist, newpsl); 2003 rcu_assign_pointer(pmc->sflist, newpsl);
2020 psl = newpsl; 2004 psl = newpsl;
@@ -2079,7 +2063,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2079 goto done; 2063 goto done;
2080 } 2064 }
2081 2065
2082 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2066 for_each_pmc_rtnl(inet, pmc) {
2083 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2067 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2084 pmc->multi.imr_ifindex == imr.imr_ifindex) 2068 pmc->multi.imr_ifindex == imr.imr_ifindex)
2085 break; 2069 break;
@@ -2109,13 +2093,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2109 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, 2093 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
2110 msf->imsf_fmode, 0, NULL, 0); 2094 msf->imsf_fmode, 0, NULL, 0);
2111 } 2095 }
2112 psl = pmc->sflist; 2096 psl = rtnl_dereference(pmc->sflist);
2113 if (psl) { 2097 if (psl) {
2114 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2098 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2115 psl->sl_count, psl->sl_addr, 0); 2099 psl->sl_count, psl->sl_addr, 0);
2116 /* decrease mem now to avoid the memleak warning */ 2100 /* decrease mem now to avoid the memleak warning */
2117 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); 2101 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2118 call_rcu(&psl->rcu, ip_sf_socklist_reclaim); 2102 kfree_rcu(psl, rcu);
2119 } else 2103 } else
2120 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2104 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2121 0, NULL, 0); 2105 0, NULL, 0);
@@ -2157,7 +2141,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2157 } 2141 }
2158 err = -EADDRNOTAVAIL; 2142 err = -EADDRNOTAVAIL;
2159 2143
2160 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2144 for_each_pmc_rtnl(inet, pmc) {
2161 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2145 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2162 pmc->multi.imr_ifindex == imr.imr_ifindex) 2146 pmc->multi.imr_ifindex == imr.imr_ifindex)
2163 break; 2147 break;
@@ -2165,7 +2149,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2165 if (!pmc) /* must have a prior join */ 2149 if (!pmc) /* must have a prior join */
2166 goto done; 2150 goto done;
2167 msf->imsf_fmode = pmc->sfmode; 2151 msf->imsf_fmode = pmc->sfmode;
2168 psl = pmc->sflist; 2152 psl = rtnl_dereference(pmc->sflist);
2169 rtnl_unlock(); 2153 rtnl_unlock();
2170 if (!psl) { 2154 if (!psl) {
2171 len = 0; 2155 len = 0;
@@ -2210,7 +2194,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2210 2194
2211 err = -EADDRNOTAVAIL; 2195 err = -EADDRNOTAVAIL;
2212 2196
2213 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2197 for_each_pmc_rtnl(inet, pmc) {
2214 if (pmc->multi.imr_multiaddr.s_addr == addr && 2198 if (pmc->multi.imr_multiaddr.s_addr == addr &&
2215 pmc->multi.imr_ifindex == gsf->gf_interface) 2199 pmc->multi.imr_ifindex == gsf->gf_interface)
2216 break; 2200 break;
@@ -2218,7 +2202,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2218 if (!pmc) /* must have a prior join */ 2202 if (!pmc) /* must have a prior join */
2219 goto done; 2203 goto done;
2220 gsf->gf_fmode = pmc->sfmode; 2204 gsf->gf_fmode = pmc->sfmode;
2221 psl = pmc->sflist; 2205 psl = rtnl_dereference(pmc->sflist);
2222 rtnl_unlock(); 2206 rtnl_unlock();
2223 count = psl ? psl->sl_count : 0; 2207 count = psl ? psl->sl_count : 0;
2224 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; 2208 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2259,7 +2243,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2259 goto out; 2243 goto out;
2260 2244
2261 rcu_read_lock(); 2245 rcu_read_lock();
2262 for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { 2246 for_each_pmc_rcu(inet, pmc) {
2263 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2247 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2264 pmc->multi.imr_ifindex == dif) 2248 pmc->multi.imr_ifindex == dif)
2265 break; 2249 break;
@@ -2267,7 +2251,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2267 ret = inet->mc_all; 2251 ret = inet->mc_all;
2268 if (!pmc) 2252 if (!pmc)
2269 goto unlock; 2253 goto unlock;
2270 psl = pmc->sflist; 2254 psl = rcu_dereference(pmc->sflist);
2271 ret = (pmc->sfmode == MCAST_EXCLUDE); 2255 ret = (pmc->sfmode == MCAST_EXCLUDE);
2272 if (!psl) 2256 if (!psl)
2273 goto unlock; 2257 goto unlock;
@@ -2302,31 +2286,29 @@ void ip_mc_drop_socket(struct sock *sk)
2302 return; 2286 return;
2303 2287
2304 rtnl_lock(); 2288 rtnl_lock();
2305 while ((iml = inet->mc_list) != NULL) { 2289 while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
2306 struct in_device *in_dev; 2290 struct in_device *in_dev;
2307 rcu_assign_pointer(inet->mc_list, iml->next);
2308 2291
2292 inet->mc_list = iml->next_rcu;
2309 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2293 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2310 (void) ip_mc_leave_src(sk, iml, in_dev); 2294 (void) ip_mc_leave_src(sk, iml, in_dev);
2311 if (in_dev != NULL) { 2295 if (in_dev != NULL)
2312 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2296 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2313 in_dev_put(in_dev);
2314 }
2315 /* decrease mem now to avoid the memleak warning */ 2297 /* decrease mem now to avoid the memleak warning */
2316 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 2298 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2317 call_rcu(&iml->rcu, ip_mc_socklist_reclaim); 2299 kfree_rcu(iml, rcu);
2318 } 2300 }
2319 rtnl_unlock(); 2301 rtnl_unlock();
2320} 2302}
2321 2303
2322int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) 2304/* called with rcu_read_lock() */
2305int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
2323{ 2306{
2324 struct ip_mc_list *im; 2307 struct ip_mc_list *im;
2325 struct ip_sf_list *psf; 2308 struct ip_sf_list *psf;
2326 int rv = 0; 2309 int rv = 0;
2327 2310
2328 read_lock(&in_dev->mc_list_lock); 2311 for_each_pmc_rcu(in_dev, im) {
2329 for (im=in_dev->mc_list; im; im=im->next) {
2330 if (im->multiaddr == mc_addr) 2312 if (im->multiaddr == mc_addr)
2331 break; 2313 break;
2332 } 2314 }
@@ -2347,7 +2329,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2347 } else 2329 } else
2348 rv = 1; /* unspecified source; tentatively allow */ 2330 rv = 1; /* unspecified source; tentatively allow */
2349 } 2331 }
2350 read_unlock(&in_dev->mc_list_lock);
2351 return rv; 2332 return rv;
2352} 2333}
2353 2334
@@ -2373,13 +2354,11 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2373 in_dev = __in_dev_get_rcu(state->dev); 2354 in_dev = __in_dev_get_rcu(state->dev);
2374 if (!in_dev) 2355 if (!in_dev)
2375 continue; 2356 continue;
2376 read_lock(&in_dev->mc_list_lock); 2357 im = rcu_dereference(in_dev->mc_list);
2377 im = in_dev->mc_list;
2378 if (im) { 2358 if (im) {
2379 state->in_dev = in_dev; 2359 state->in_dev = in_dev;
2380 break; 2360 break;
2381 } 2361 }
2382 read_unlock(&in_dev->mc_list_lock);
2383 } 2362 }
2384 return im; 2363 return im;
2385} 2364}
@@ -2387,11 +2366,9 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2387static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) 2366static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2388{ 2367{
2389 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2368 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2390 im = im->next;
2391 while (!im) {
2392 if (likely(state->in_dev != NULL))
2393 read_unlock(&state->in_dev->mc_list_lock);
2394 2369
2370 im = rcu_dereference(im->next_rcu);
2371 while (!im) {
2395 state->dev = next_net_device_rcu(state->dev); 2372 state->dev = next_net_device_rcu(state->dev);
2396 if (!state->dev) { 2373 if (!state->dev) {
2397 state->in_dev = NULL; 2374 state->in_dev = NULL;
@@ -2400,8 +2377,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
2400 state->in_dev = __in_dev_get_rcu(state->dev); 2377 state->in_dev = __in_dev_get_rcu(state->dev);
2401 if (!state->in_dev) 2378 if (!state->in_dev)
2402 continue; 2379 continue;
2403 read_lock(&state->in_dev->mc_list_lock); 2380 im = rcu_dereference(state->in_dev->mc_list);
2404 im = state->in_dev->mc_list;
2405 } 2381 }
2406 return im; 2382 return im;
2407} 2383}
@@ -2437,10 +2413,8 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2437 __releases(rcu) 2413 __releases(rcu)
2438{ 2414{
2439 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2415 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2440 if (likely(state->in_dev != NULL)) { 2416
2441 read_unlock(&state->in_dev->mc_list_lock); 2417 state->in_dev = NULL;
2442 state->in_dev = NULL;
2443 }
2444 state->dev = NULL; 2418 state->dev = NULL;
2445 rcu_read_unlock(); 2419 rcu_read_unlock();
2446} 2420}
@@ -2462,7 +2436,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2462 querier = "NONE"; 2436 querier = "NONE";
2463#endif 2437#endif
2464 2438
2465 if (state->in_dev->mc_list == im) { 2439 if (rcu_dereference(state->in_dev->mc_list) == im) {
2466 seq_printf(seq, "%d\t%-10s: %5d %7s\n", 2440 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2467 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2441 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2468 } 2442 }
@@ -2521,8 +2495,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2521 idev = __in_dev_get_rcu(state->dev); 2495 idev = __in_dev_get_rcu(state->dev);
2522 if (unlikely(idev == NULL)) 2496 if (unlikely(idev == NULL))
2523 continue; 2497 continue;
2524 read_lock(&idev->mc_list_lock); 2498 im = rcu_dereference(idev->mc_list);
2525 im = idev->mc_list;
2526 if (likely(im != NULL)) { 2499 if (likely(im != NULL)) {
2527 spin_lock_bh(&im->lock); 2500 spin_lock_bh(&im->lock);
2528 psf = im->sources; 2501 psf = im->sources;
@@ -2533,7 +2506,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2533 } 2506 }
2534 spin_unlock_bh(&im->lock); 2507 spin_unlock_bh(&im->lock);
2535 } 2508 }
2536 read_unlock(&idev->mc_list_lock);
2537 } 2509 }
2538 return psf; 2510 return psf;
2539} 2511}
@@ -2547,9 +2519,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2547 spin_unlock_bh(&state->im->lock); 2519 spin_unlock_bh(&state->im->lock);
2548 state->im = state->im->next; 2520 state->im = state->im->next;
2549 while (!state->im) { 2521 while (!state->im) {
2550 if (likely(state->idev != NULL))
2551 read_unlock(&state->idev->mc_list_lock);
2552
2553 state->dev = next_net_device_rcu(state->dev); 2522 state->dev = next_net_device_rcu(state->dev);
2554 if (!state->dev) { 2523 if (!state->dev) {
2555 state->idev = NULL; 2524 state->idev = NULL;
@@ -2558,8 +2527,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2558 state->idev = __in_dev_get_rcu(state->dev); 2527 state->idev = __in_dev_get_rcu(state->dev);
2559 if (!state->idev) 2528 if (!state->idev)
2560 continue; 2529 continue;
2561 read_lock(&state->idev->mc_list_lock); 2530 state->im = rcu_dereference(state->idev->mc_list);
2562 state->im = state->idev->mc_list;
2563 } 2531 }
2564 if (!state->im) 2532 if (!state->im)
2565 break; 2533 break;
@@ -2605,10 +2573,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2605 spin_unlock_bh(&state->im->lock); 2573 spin_unlock_bh(&state->im->lock);
2606 state->im = NULL; 2574 state->im = NULL;
2607 } 2575 }
2608 if (likely(state->idev != NULL)) { 2576 state->idev = NULL;
2609 read_unlock(&state->idev->mc_list_lock);
2610 state->idev = NULL;
2611 }
2612 state->dev = NULL; 2577 state->dev = NULL;
2613 rcu_read_unlock(); 2578 rcu_read_unlock();
2614} 2579}
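Note: taken together, the igmp.c hunks replace the mc_list_lock rwlock with RCU: readers walk in_dev->mc_list under rcu_read_lock(), writers rely on the RTNL mutex, new entries are published with rcu_assign_pointer(), and frees are deferred with kfree_rcu(). The for_each_pmc_rcu()/for_each_pmc_rtnl() helpers used throughout are defined earlier in this file; they look roughly like this (sketch, not copied from the diff):

	#define for_each_pmc_rcu(in_dev, pmc)				\
		for (pmc = rcu_dereference(in_dev->mc_list);		\
		     pmc != NULL;					\
		     pmc = rcu_dereference(pmc->next_rcu))

	#define for_each_pmc_rtnl(in_dev, pmc)				\
		for (pmc = rtnl_dereference(in_dev->mc_list);		\
		     pmc != NULL;					\
		     pmc = rtnl_dereference(pmc->next_rcu))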
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7174370b1195..c14d88ad348d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
33 * This struct holds the first and last local port number. 33 * This struct holds the first and last local port number.
34 */ 34 */
35struct local_ports sysctl_local_ports __read_mostly = { 35struct local_ports sysctl_local_ports __read_mostly = {
36 .lock = SEQLOCK_UNLOCKED, 36 .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
37 .range = { 32768, 61000 }, 37 .range = { 32768, 61000 },
38}; 38};
39 39
@@ -55,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range);
55int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
56 const struct inet_bind_bucket *tb) 56 const struct inet_bind_bucket *tb)
57{ 57{
58 const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 60 int reuse = sk->sk_reuse;
@@ -75,9 +74,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 76 sk2->sk_state == TCP_LISTEN) {
78 const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 } 82 }
@@ -351,30 +350,24 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
351EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 350EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
352 351
353struct dst_entry *inet_csk_route_req(struct sock *sk, 352struct dst_entry *inet_csk_route_req(struct sock *sk,
353 struct flowi4 *fl4,
354 const struct request_sock *req) 354 const struct request_sock *req)
355{ 355{
356 struct rtable *rt; 356 struct rtable *rt;
357 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
358 struct ip_options *opt = inet_rsk(req)->opt; 358 struct ip_options_rcu *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if,
360 .mark = sk->sk_mark,
361 .nl_u = { .ip4_u =
362 { .daddr = ((opt && opt->srr) ?
363 opt->faddr :
364 ireq->rmt_addr),
365 .saddr = ireq->loc_addr,
366 .tos = RT_CONN_FLAGS(sk) } },
367 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports =
370 { .sport = inet_sk(sk)->inet_sport,
371 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 359 struct net *net = sock_net(sk);
373 360
374 security_req_classify_flow(req, &fl); 361 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
375 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) 362 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
363 sk->sk_protocol, inet_sk_flowi_flags(sk),
364 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
365 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
366 security_req_classify_flow(req, flowi4_to_flowi(fl4));
367 rt = ip_route_output_flow(net, fl4, sk);
368 if (IS_ERR(rt))
376 goto no_route; 369 goto no_route;
377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 370 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
378 goto route_err; 371 goto route_err;
379 return &rt->dst; 372 return &rt->dst;
380 373
@@ -386,6 +379,39 @@ no_route:
386} 379}
387EXPORT_SYMBOL_GPL(inet_csk_route_req); 380EXPORT_SYMBOL_GPL(inet_csk_route_req);
388 381
382struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
383 struct sock *newsk,
384 const struct request_sock *req)
385{
386 const struct inet_request_sock *ireq = inet_rsk(req);
387 struct inet_sock *newinet = inet_sk(newsk);
388 struct ip_options_rcu *opt = ireq->opt;
389 struct net *net = sock_net(sk);
390 struct flowi4 *fl4;
391 struct rtable *rt;
392
393 fl4 = &newinet->cork.fl.u.ip4;
394 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
395 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
396 sk->sk_protocol, inet_sk_flowi_flags(sk),
397 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
398 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
399 security_req_classify_flow(req, flowi4_to_flowi(fl4));
400 rt = ip_route_output_flow(net, fl4, sk);
401 if (IS_ERR(rt))
402 goto no_route;
403 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
404 goto route_err;
405 return &rt->dst;
406
407route_err:
408 ip_rt_put(rt);
409no_route:
410 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
411 return NULL;
412}
413EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
414
389static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, 415static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
390 const u32 rnd, const u32 synq_hsize) 416 const u32 rnd, const u32 synq_hsize)
391{ 417{
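Note: inet_csk_route_req() now takes a caller-provided struct flowi4 and fills it via flowi4_init_output(), and the new inet_csk_route_child_sock() applies the same flow setup when a request socket becomes a full child socket, storing the flow in the child's inet cork. A hedged sketch of how an accept path might consume the new helper; example_attach_route() is illustrative and not part of this diff (the real user is the TCP accept path):

	static int example_attach_route(struct sock *listener, struct sock *newsk,
					const struct request_sock *req)
	{
		struct dst_entry *dst = inet_csk_route_child_sock(listener, newsk, req);

		if (!dst)
			return -ENETUNREACH;	/* helper already counted OUTNOROUTES */
		sk_setup_caps(newsk, dst);	/* child socket now owns the route */
		return 0;
	}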
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..3267d3898437 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -124,7 +124,7 @@ static int inet_csk_diag_fill(struct sock *sk,
124 124
125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
126 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
127 struct ipv6_pinfo *np = inet6_sk(sk); 127 const struct ipv6_pinfo *np = inet6_sk(sk);
128 128
129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
130 &np->rcv_saddr); 130 &np->rcv_saddr);
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
425 bc += op->no; 425 bc += op->no;
426 } 426 }
427 } 427 }
428 return (len == 0); 428 return len == 0;
429} 429}
430 430
431static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
@@ -437,7 +437,7 @@ static int valid_cc(const void *bc, int len, int cc)
437 return 0; 437 return 0;
438 if (cc == len) 438 if (cc == len)
439 return 1; 439 return 1;
440 if (op->yes < 4) 440 if (op->yes < 4 || op->yes & 3)
441 return 0; 441 return 0;
442 len -= op->yes; 442 len -= op->yes;
443 bc += op->yes; 443 bc += op->yes;
@@ -447,11 +447,11 @@ static int valid_cc(const void *bc, int len, int cc)
447 447
448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) 448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
449{ 449{
450 const unsigned char *bc = bytecode; 450 const void *bc = bytecode;
451 int len = bytecode_len; 451 int len = bytecode_len;
452 452
453 while (len > 0) { 453 while (len > 0) {
454 struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; 454 const struct inet_diag_bc_op *op = bc;
455 455
456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); 456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
457 switch (op->code) { 457 switch (op->code) {
@@ -462,22 +462,20 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
462 case INET_DIAG_BC_S_LE: 462 case INET_DIAG_BC_S_LE:
463 case INET_DIAG_BC_D_GE: 463 case INET_DIAG_BC_D_GE:
464 case INET_DIAG_BC_D_LE: 464 case INET_DIAG_BC_D_LE:
465 if (op->yes < 4 || op->yes > len + 4)
466 return -EINVAL;
467 case INET_DIAG_BC_JMP: 465 case INET_DIAG_BC_JMP:
468 if (op->no < 4 || op->no > len + 4) 466 if (op->no < 4 || op->no > len + 4 || op->no & 3)
469 return -EINVAL; 467 return -EINVAL;
470 if (op->no < len && 468 if (op->no < len &&
471 !valid_cc(bytecode, bytecode_len, len - op->no)) 469 !valid_cc(bytecode, bytecode_len, len - op->no))
472 return -EINVAL; 470 return -EINVAL;
473 break; 471 break;
474 case INET_DIAG_BC_NOP: 472 case INET_DIAG_BC_NOP:
475 if (op->yes < 4 || op->yes > len + 4)
476 return -EINVAL;
477 break; 473 break;
478 default: 474 default:
479 return -EINVAL; 475 return -EINVAL;
480 } 476 }
477 if (op->yes < 4 || op->yes > len + 4 || op->yes & 3)
478 return -EINVAL;
481 bc += op->yes; 479 bc += op->yes;
482 len -= op->yes; 480 len -= op->yes;
483 } 481 }
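Note: the audit rewrite applies one rule to both jump fields: an offset must be at least one op (4 bytes), must not run past the remaining bytecode, and must be a multiple of 4 so following it always lands on an op boundary; the check is hoisted below the switch so no opcode can bypass it. Restated as a small predicate (bc_offset_ok() is illustrative):

	/* struct inet_diag_bc_op is 4 bytes, so valid jump offsets are
	 * 4-byte multiples within the remaining bytecode (the +4 allows a
	 * jump that steps exactly past the end). */
	static bool bc_offset_ok(unsigned int offset, int remaining)
	{
		return offset >= 4 && offset <= remaining + 4 && !(offset & 3);
	}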
@@ -490,9 +488,11 @@ static int inet_csk_diag_dump(struct sock *sk,
490{ 488{
491 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 489 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
492 490
493 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 491 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
494 struct inet_diag_entry entry; 492 struct inet_diag_entry entry;
495 struct rtattr *bc = (struct rtattr *)(r + 1); 493 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
494 sizeof(*r),
495 INET_DIAG_REQ_BYTECODE);
496 struct inet_sock *inet = inet_sk(sk); 496 struct inet_sock *inet = inet_sk(sk);
497 497
498 entry.family = sk->sk_family; 498 entry.family = sk->sk_family;
@@ -512,7 +512,7 @@ static int inet_csk_diag_dump(struct sock *sk,
512 entry.dport = ntohs(inet->inet_dport); 512 entry.dport = ntohs(inet->inet_dport);
513 entry.userlocks = sk->sk_userlocks; 513 entry.userlocks = sk->sk_userlocks;
514 514
515 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 515 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
516 return 0; 516 return 0;
517 } 517 }
518 518
@@ -527,9 +527,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
527{ 527{
528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
529 529
530 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 530 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
531 struct inet_diag_entry entry; 531 struct inet_diag_entry entry;
532 struct rtattr *bc = (struct rtattr *)(r + 1); 532 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
533 sizeof(*r),
534 INET_DIAG_REQ_BYTECODE);
533 535
534 entry.family = tw->tw_family; 536 entry.family = tw->tw_family;
535#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 537#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -548,7 +550,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
548 entry.dport = ntohs(tw->tw_dport); 550 entry.dport = ntohs(tw->tw_dport);
549 entry.userlocks = 0; 551 entry.userlocks = 0;
550 552
551 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 553 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
552 return 0; 554 return 0;
553 } 555 }
554 556
@@ -618,7 +620,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
618 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 620 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
619 struct inet_connection_sock *icsk = inet_csk(sk); 621 struct inet_connection_sock *icsk = inet_csk(sk);
620 struct listen_sock *lopt; 622 struct listen_sock *lopt;
621 struct rtattr *bc = NULL; 623 const struct nlattr *bc = NULL;
622 struct inet_sock *inet = inet_sk(sk); 624 struct inet_sock *inet = inet_sk(sk);
623 int j, s_j; 625 int j, s_j;
624 int reqnum, s_reqnum; 626 int reqnum, s_reqnum;
@@ -638,8 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
638 if (!lopt || !lopt->qlen) 640 if (!lopt || !lopt->qlen)
639 goto out; 641 goto out;
640 642
641 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 643 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
642 bc = (struct rtattr *)(r + 1); 644 bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
645 INET_DIAG_REQ_BYTECODE);
643 entry.sport = inet->inet_num; 646 entry.sport = inet->inet_num;
644 entry.userlocks = sk->sk_userlocks; 647 entry.userlocks = sk->sk_userlocks;
645 } 648 }
@@ -672,8 +675,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
672 &ireq->rmt_addr; 675 &ireq->rmt_addr;
673 entry.dport = ntohs(ireq->rmt_port); 676 entry.dport = ntohs(ireq->rmt_port);
674 677
675 if (!inet_diag_bc_run(RTA_DATA(bc), 678 if (!inet_diag_bc_run(nla_data(bc),
676 RTA_PAYLOAD(bc), &entry)) 679 nla_len(bc), &entry))
677 continue; 680 continue;
678 } 681 }
679 682
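Note: all three dump paths (full sockets, timewait sockets, request sockets) now locate the filter the same way: check nlmsg_attrlen() for appended attributes, fetch INET_DIAG_REQ_BYTECODE with nlmsg_find_attr(), and hand nla_data()/nla_len() to inet_diag_bc_run(). The shared lookup, pulled into a hypothetical helper for illustration:

	static const struct nlattr *diag_find_bytecode(struct nlmsghdr *nlh)
	{
		if (!nlmsg_attrlen(nlh, sizeof(struct inet_diag_req)))
			return NULL;	/* no attributes follow the request header */
		return nlmsg_find_attr(nlh, sizeof(struct inet_diag_req),
				       INET_DIAG_REQ_BYTECODE);
	}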
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff3..3c0369a3a663 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,42 @@ void inet_put_port(struct sock *sk)
101} 101}
102EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
103 103
104void __inet_inherit_port(struct sock *sk, struct sock *child) 104int __inet_inherit_port(struct sock *sk, struct sock *child)
105{ 105{
106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
107 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, 107 unsigned short port = inet_sk(child)->inet_num;
108 const int bhash = inet_bhashfn(sock_net(sk), port,
108 table->bhash_size); 109 table->bhash_size);
109 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
110 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
111 112
112 spin_lock(&head->lock); 113 spin_lock(&head->lock);
113 tb = inet_csk(sk)->icsk_bind_hash; 114 tb = inet_csk(sk)->icsk_bind_hash;
114 sk_add_bind_node(child, &tb->owners); 115 if (tb->port != port) {
115 inet_csk(child)->icsk_bind_hash = tb; 116 /* NOTE: using tproxy and redirecting skbs to a proxy
117 * on a different listener port breaks the assumption
118 * that the listener socket's icsk_bind_hash is the same
119 * as that of the child socket. We have to look up or
120 * create a new bind bucket for the child here. */
121 struct hlist_node *node;
122 inet_bind_bucket_for_each(tb, node, &head->chain) {
123 if (net_eq(ib_net(tb), sock_net(sk)) &&
124 tb->port == port)
125 break;
126 }
127 if (!node) {
128 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
129 sock_net(sk), head, port);
130 if (!tb) {
131 spin_unlock(&head->lock);
132 return -ENOMEM;
133 }
134 }
135 }
136 inet_bind_hash(child, tb, port);
116 spin_unlock(&head->lock); 137 spin_unlock(&head->lock);
138
139 return 0;
117} 140}
118EXPORT_SYMBOL_GPL(__inet_inherit_port); 141EXPORT_SYMBOL_GPL(__inet_inherit_port);
119 142
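Note: because __inet_inherit_port() may now have to allocate a bind bucket (the tproxy case where the child's port differs from the listener's), it returns an int and can fail with -ENOMEM, so callers must unwind instead of assuming success. A hedged sketch of the caller-side shape; example_finish_child() is illustrative, and the real accept paths elsewhere in this series do more cleanup than shown here:

	static struct sock *example_finish_child(struct sock *listener,
						 struct sock *child)
	{
		if (__inet_inherit_port(listener, child) < 0) {
			/* no bind bucket could be found or created for child's port */
			sock_put(child);
			return NULL;
		}
		return child;
	}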
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 47038cb6c138..85a0f75dae64 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -51,8 +51,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
51 * Basic tcp checks whether packet is suitable for LRO 51 * Basic tcp checks whether packet is suitable for LRO
52 */ 52 */
53 53
54static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, 54static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
55 int len, struct net_lro_desc *lro_desc) 55 int len, const struct net_lro_desc *lro_desc)
56{ 56{
57 /* check ip header: don't aggregate padded frames */ 57 /* check ip header: don't aggregate padded frames */
58 if (ntohs(iph->tot_len) != len) 58 if (ntohs(iph->tot_len) != len)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf701..3c8dfa16614d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
505 } 505 }
506 506
507 rcu_read_unlock(); 507 rcu_read_unlock();
508 local_bh_disable();
508 inet_twsk_deschedule(tw, twdr); 509 inet_twsk_deschedule(tw, twdr);
510 local_bh_enable();
509 inet_twsk_put(tw); 511 inet_twsk_put(tw);
510 goto restart_rcu; 512 goto restart_rcu;
511 } 513 }
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9ffa24b9a804..ce616d92cc54 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -63,7 +63,7 @@
63 * refcnt: atomically against modifications on other CPU; 63 * refcnt: atomically against modifications on other CPU;
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * daddr: unchangeable
67 * ip_id_count: atomic value (no lock needed) 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
@@ -72,21 +72,31 @@ static struct kmem_cache *peer_cachep __read_mostly;
72#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
73 73
74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
75#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
75static const struct inet_peer peer_fake_node = { 76static const struct inet_peer peer_fake_node = {
76 .avl_left = peer_avl_empty, 77 .avl_left = peer_avl_empty_rcu,
77 .avl_right = peer_avl_empty, 78 .avl_right = peer_avl_empty_rcu,
78 .avl_height = 0 79 .avl_height = 0
79}; 80};
80 81
81static struct { 82struct inet_peer_base {
82 struct inet_peer *root; 83 struct inet_peer __rcu *root;
83 spinlock_t lock; 84 seqlock_t lock;
84 int total; 85 int total;
85} peers = { 86};
86 .root = peer_avl_empty, 87
87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu,
90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0,
92};
93
94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu,
96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
88 .total = 0, 97 .total = 0,
89}; 98};
99
90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 100#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
91 101
92/* Exported for sysctl_net_ipv4. */ 102/* Exported for sysctl_net_ipv4. */
@@ -144,62 +154,99 @@ void __init inet_initpeers(void)
144/* Called with or without local BH being disabled. */ 154/* Called with or without local BH being disabled. */
145static void unlink_from_unused(struct inet_peer *p) 155static void unlink_from_unused(struct inet_peer *p)
146{ 156{
147 if (!list_empty(&p->unused)) { 157 spin_lock_bh(&unused_peers.lock);
148 spin_lock_bh(&unused_peers.lock); 158 list_del_init(&p->unused);
149 list_del_init(&p->unused); 159 spin_unlock_bh(&unused_peers.lock);
150 spin_unlock_bh(&unused_peers.lock); 160}
161
162static int addr_compare(const struct inetpeer_addr *a,
163 const struct inetpeer_addr *b)
164{
165 int i, n = (a->family == AF_INET ? 1 : 4);
166
167 for (i = 0; i < n; i++) {
168 if (a->addr.a6[i] == b->addr.a6[i])
169 continue;
170 if (a->addr.a6[i] < b->addr.a6[i])
171 return -1;
172 return 1;
151 } 173 }
174
175 return 0;
152} 176}
153 177
178#define rcu_deref_locked(X, BASE) \
179 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
180
154/* 181/*
155 * Called with local BH disabled and the pool lock held. 182 * Called with local BH disabled and the pool lock held.
156 */ 183 */
157#define lookup(_daddr, _stack) \ 184#define lookup(_daddr, _stack, _base) \
158({ \ 185({ \
159 struct inet_peer *u, **v; \ 186 struct inet_peer *u; \
187 struct inet_peer __rcu **v; \
160 \ 188 \
161 stackptr = _stack; \ 189 stackptr = _stack; \
162 *stackptr++ = &peers.root; \ 190 *stackptr++ = &_base->root; \
163 for (u = peers.root; u != peer_avl_empty; ) { \ 191 for (u = rcu_deref_locked(_base->root, _base); \
164 if (_daddr == u->v4daddr) \ 192 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \
165 break; \ 195 break; \
166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 196 if (cmp == -1) \
167 v = &u->avl_left; \ 197 v = &u->avl_left; \
168 else \ 198 else \
169 v = &u->avl_right; \ 199 v = &u->avl_right; \
170 *stackptr++ = v; \ 200 *stackptr++ = v; \
171 u = *v; \ 201 u = rcu_deref_locked(*v, _base); \
172 } \ 202 } \
173 u; \ 203 u; \
174}) 204})
175 205
206static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
207{
208 int cur, old = atomic_read(ptr);
209
210 while (old != u) {
211 *newv = old + a;
212 cur = atomic_cmpxchg(ptr, old, *newv);
213 if (cur == old)
214 return true;
215 old = cur;
216 }
217 return false;
218}
219
176/* 220/*
177 * Called with rcu_read_lock_bh() 221 * Called with rcu_read_lock()
178 * Because we hold no lock against a writer, its quite possible we fall 222 * Because we hold no lock against a writer, its quite possible we fall
179 * in an endless loop. 223 * in an endless loop.
180 * But every pointer we follow is guaranteed to be valid thanks to RCU. 224 * But every pointer we follow is guaranteed to be valid thanks to RCU.
181 * We exit from this function if number of links exceeds PEER_MAXDEPTH 225 * We exit from this function if number of links exceeds PEER_MAXDEPTH
182 */ 226 */
183static struct inet_peer *lookup_rcu_bh(__be32 daddr) 227static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
228 struct inet_peer_base *base,
229 int *newrefcnt)
184{ 230{
185 struct inet_peer *u = rcu_dereference_bh(peers.root); 231 struct inet_peer *u = rcu_dereference(base->root);
186 int count = 0; 232 int count = 0;
187 233
188 while (u != peer_avl_empty) { 234 while (u != peer_avl_empty) {
189 if (daddr == u->v4daddr) { 235 int cmp = addr_compare(daddr, &u->daddr);
236 if (cmp == 0) {
190 /* Before taking a reference, check if this entry was 237 /* Before taking a reference, check if this entry was
191 * deleted, unlink_from_pool() sets refcnt=-1 to make 238 * deleted, unlink_from_pool() sets refcnt=-1 to make
192 * distinction between an unused entry (refcnt=0) and 239 * distinction between an unused entry (refcnt=0) and
193 * a freed one. 240 * a freed one.
194 */ 241 */
195 if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) 242 if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
196 u = NULL; 243 u = NULL;
197 return u; 244 return u;
198 } 245 }
199 if ((__force __u32)daddr < (__force __u32)u->v4daddr) 246 if (cmp == -1)
200 u = rcu_dereference_bh(u->avl_left); 247 u = rcu_dereference(u->avl_left);
201 else 248 else
202 u = rcu_dereference_bh(u->avl_right); 249 u = rcu_dereference(u->avl_right);
203 if (unlikely(++count == PEER_MAXDEPTH)) 250 if (unlikely(++count == PEER_MAXDEPTH))
204 break; 251 break;
205 } 252 }
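Note: with the tree now keyed by struct inetpeer_addr, addr_compare() gives IPv4 and IPv6 entries a single total order (one 32-bit word compared for AF_INET, four for AF_INET6), and lookup_rcu() reports through *newrefcnt whether its reference was the first one taken. A sketch of the key an IPv4 caller would build for the reworked inet_getpeer(); the skb variable is assumed to be in scope:

	struct inetpeer_addr key = {
		.family  = AF_INET,
		.addr.a4 = ip_hdr(skb)->daddr,	/* network byte order, as stored in the tree */
	};
	struct inet_peer *peer = inet_getpeer(&key, 1);	/* create if missing */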
@@ -207,15 +254,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
207} 254}
208 255
209/* Called with local BH disabled and the pool lock held. */ 256/* Called with local BH disabled and the pool lock held. */
210#define lookup_rightempty(start) \ 257#define lookup_rightempty(start, base) \
211({ \ 258({ \
212 struct inet_peer *u, **v; \ 259 struct inet_peer *u; \
260 struct inet_peer __rcu **v; \
213 *stackptr++ = &start->avl_left; \ 261 *stackptr++ = &start->avl_left; \
214 v = &start->avl_left; \ 262 v = &start->avl_left; \
215 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 263 for (u = rcu_deref_locked(*v, base); \
264 u->avl_right != peer_avl_empty_rcu; ) { \
216 v = &u->avl_right; \ 265 v = &u->avl_right; \
217 *stackptr++ = v; \ 266 *stackptr++ = v; \
218 u = *v; \ 267 u = rcu_deref_locked(*v, base); \
219 } \ 268 } \
220 u; \ 269 u; \
221}) 270})
@@ -224,74 +273,76 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
224 * Variable names are the proof of operation correctness. 273 * Variable names are the proof of operation correctness.
225 * Look into mm/map_avl.c for more detail description of the ideas. 274 * Look into mm/map_avl.c for more detail description of the ideas.
226 */ 275 */
227static void peer_avl_rebalance(struct inet_peer **stack[], 276static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
228 struct inet_peer ***stackend) 277 struct inet_peer __rcu ***stackend,
278 struct inet_peer_base *base)
229{ 279{
230 struct inet_peer **nodep, *node, *l, *r; 280 struct inet_peer __rcu **nodep;
281 struct inet_peer *node, *l, *r;
231 int lh, rh; 282 int lh, rh;
232 283
233 while (stackend > stack) { 284 while (stackend > stack) {
234 nodep = *--stackend; 285 nodep = *--stackend;
235 node = *nodep; 286 node = rcu_deref_locked(*nodep, base);
236 l = node->avl_left; 287 l = rcu_deref_locked(node->avl_left, base);
237 r = node->avl_right; 288 r = rcu_deref_locked(node->avl_right, base);
238 lh = node_height(l); 289 lh = node_height(l);
239 rh = node_height(r); 290 rh = node_height(r);
240 if (lh > rh + 1) { /* l: RH+2 */ 291 if (lh > rh + 1) { /* l: RH+2 */
241 struct inet_peer *ll, *lr, *lrl, *lrr; 292 struct inet_peer *ll, *lr, *lrl, *lrr;
242 int lrh; 293 int lrh;
243 ll = l->avl_left; 294 ll = rcu_deref_locked(l->avl_left, base);
244 lr = l->avl_right; 295 lr = rcu_deref_locked(l->avl_right, base);
245 lrh = node_height(lr); 296 lrh = node_height(lr);
246 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 297 if (lrh <= node_height(ll)) { /* ll: RH+1 */
247 node->avl_left = lr; /* lr: RH or RH+1 */ 298 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
248 node->avl_right = r; /* r: RH */ 299 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
249 node->avl_height = lrh + 1; /* RH+1 or RH+2 */ 300 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
250 l->avl_left = ll; /* ll: RH+1 */ 301 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
251 l->avl_right = node; /* node: RH+1 or RH+2 */ 302 RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
252 l->avl_height = node->avl_height + 1; 303 l->avl_height = node->avl_height + 1;
253 *nodep = l; 304 RCU_INIT_POINTER(*nodep, l);
254 } else { /* ll: RH, lr: RH+1 */ 305 } else { /* ll: RH, lr: RH+1 */
255 lrl = lr->avl_left; /* lrl: RH or RH-1 */ 306 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
256 lrr = lr->avl_right; /* lrr: RH or RH-1 */ 307 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
257 node->avl_left = lrr; /* lrr: RH or RH-1 */ 308 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
258 node->avl_right = r; /* r: RH */ 309 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
259 node->avl_height = rh + 1; /* node: RH+1 */ 310 node->avl_height = rh + 1; /* node: RH+1 */
260 l->avl_left = ll; /* ll: RH */ 311 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
261 l->avl_right = lrl; /* lrl: RH or RH-1 */ 312 RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
262 l->avl_height = rh + 1; /* l: RH+1 */ 313 l->avl_height = rh + 1; /* l: RH+1 */
263 lr->avl_left = l; /* l: RH+1 */ 314 RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
264 lr->avl_right = node; /* node: RH+1 */ 315 RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
265 lr->avl_height = rh + 2; 316 lr->avl_height = rh + 2;
266 *nodep = lr; 317 RCU_INIT_POINTER(*nodep, lr);
267 } 318 }
268 } else if (rh > lh + 1) { /* r: LH+2 */ 319 } else if (rh > lh + 1) { /* r: LH+2 */
269 struct inet_peer *rr, *rl, *rlr, *rll; 320 struct inet_peer *rr, *rl, *rlr, *rll;
270 int rlh; 321 int rlh;
271 rr = r->avl_right; 322 rr = rcu_deref_locked(r->avl_right, base);
272 rl = r->avl_left; 323 rl = rcu_deref_locked(r->avl_left, base);
273 rlh = node_height(rl); 324 rlh = node_height(rl);
274 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 325 if (rlh <= node_height(rr)) { /* rr: LH+1 */
275 node->avl_right = rl; /* rl: LH or LH+1 */ 326 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
276 node->avl_left = l; /* l: LH */ 327 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
277 node->avl_height = rlh + 1; /* LH+1 or LH+2 */ 328 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
278 r->avl_right = rr; /* rr: LH+1 */ 329 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
279 r->avl_left = node; /* node: LH+1 or LH+2 */ 330 RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
280 r->avl_height = node->avl_height + 1; 331 r->avl_height = node->avl_height + 1;
281 *nodep = r; 332 RCU_INIT_POINTER(*nodep, r);
282 } else { /* rr: RH, rl: RH+1 */ 333 } else { /* rr: RH, rl: RH+1 */
283 rlr = rl->avl_right; /* rlr: LH or LH-1 */ 334 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
284 rll = rl->avl_left; /* rll: LH or LH-1 */ 335 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
285 node->avl_right = rll; /* rll: LH or LH-1 */ 336 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
286 node->avl_left = l; /* l: LH */ 337 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
287 node->avl_height = lh + 1; /* node: LH+1 */ 338 node->avl_height = lh + 1; /* node: LH+1 */
288 r->avl_right = rr; /* rr: LH */ 339 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
289 r->avl_left = rlr; /* rlr: LH or LH-1 */ 340 RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
290 r->avl_height = lh + 1; /* r: LH+1 */ 341 r->avl_height = lh + 1; /* r: LH+1 */
291 rl->avl_right = r; /* r: LH+1 */ 342 RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
292 rl->avl_left = node; /* node: LH+1 */ 343 RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
293 rl->avl_height = lh + 2; 344 rl->avl_height = lh + 2;
294 *nodep = rl; 345 RCU_INIT_POINTER(*nodep, rl);
295 } 346 }
296 } else { 347 } else {
297 node->avl_height = (lh > rh ? lh : rh) + 1; 348 node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -300,14 +351,14 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
300} 351}
301 352
302/* Called with local BH disabled and the pool lock held. */ 353/* Called with local BH disabled and the pool lock held. */
303#define link_to_pool(n) \ 354#define link_to_pool(n, base) \
304do { \ 355do { \
305 n->avl_height = 1; \ 356 n->avl_height = 1; \
306 n->avl_left = peer_avl_empty; \ 357 n->avl_left = peer_avl_empty_rcu; \
307 n->avl_right = peer_avl_empty; \ 358 n->avl_right = peer_avl_empty_rcu; \
308 smp_wmb(); /* lockless readers can catch us now */ \ 359 /* lockless readers can catch us now */ \
309 **--stackptr = n; \ 360 rcu_assign_pointer(**--stackptr, n); \
310 peer_avl_rebalance(stack, stackptr); \ 361 peer_avl_rebalance(stack, stackptr, base); \
311} while (0) 362} while (0)
312 363
313static void inetpeer_free_rcu(struct rcu_head *head) 364static void inetpeer_free_rcu(struct rcu_head *head)
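Note: in link_to_pool() the explicit smp_wmb() followed by a plain store is folded into rcu_assign_pointer(), which supplies the publication barrier itself; lookup_rcu() pairs it with rcu_dereference() on the reader side. The pairing, reduced to two hypothetical helpers:

	static void publish_left_child(struct inet_peer *parent, struct inet_peer *n)
	{
		/* writer side, pool lock held: barrier + store in one call */
		rcu_assign_pointer(parent->avl_left, n);
	}

	static struct inet_peer *read_left_child(struct inet_peer *parent)
	{
		/* reader side, under rcu_read_lock(): dependency-ordered load */
		return rcu_dereference(parent->avl_left);
	}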
@@ -316,13 +367,14 @@ static void inetpeer_free_rcu(struct rcu_head *head)
316} 367}
317 368
318/* May be called with local BH enabled. */ 369/* May be called with local BH enabled. */
319static void unlink_from_pool(struct inet_peer *p) 370static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
371 struct inet_peer __rcu **stack[PEER_MAXDEPTH])
320{ 372{
321 int do_free; 373 int do_free;
322 374
323 do_free = 0; 375 do_free = 0;
324 376
325 spin_lock_bh(&peers.lock); 377 write_seqlock_bh(&base->lock);
326 /* Check the reference counter. It was artificially incremented by 1 378 /* Check the reference counter. It was artificially incremented by 1
327 * in cleanup() function to prevent sudden disappearing. If we can 379 * in cleanup() function to prevent sudden disappearing. If we can
328 * atomically (because of lockless readers) take this last reference, 380 * atomically (because of lockless readers) take this last reference,
@@ -330,38 +382,37 @@ static void unlink_from_pool(struct inet_peer *p)
330 * We use refcnt=-1 to alert lockless readers this entry is deleted. 382 * We use refcnt=-1 to alert lockless readers this entry is deleted.
331 */ 383 */
332 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 384 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
333 struct inet_peer **stack[PEER_MAXDEPTH]; 385 struct inet_peer __rcu ***stackptr, ***delp;
334 struct inet_peer ***stackptr, ***delp; 386 if (lookup(&p->daddr, stack, base) != p)
335 if (lookup(p->v4daddr, stack) != p)
336 BUG(); 387 BUG();
337 delp = stackptr - 1; /* *delp[0] == p */ 388 delp = stackptr - 1; /* *delp[0] == p */
338 if (p->avl_left == peer_avl_empty) { 389 if (p->avl_left == peer_avl_empty_rcu) {
339 *delp[0] = p->avl_right; 390 *delp[0] = p->avl_right;
340 --stackptr; 391 --stackptr;
341 } else { 392 } else {
342 /* look for a node to insert instead of p */ 393 /* look for a node to insert instead of p */
343 struct inet_peer *t; 394 struct inet_peer *t;
344 t = lookup_rightempty(p); 395 t = lookup_rightempty(p, base);
345 BUG_ON(*stackptr[-1] != t); 396 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
346 **--stackptr = t->avl_left; 397 **--stackptr = t->avl_left;
347 /* t is removed, t->v4daddr > x->v4daddr for any 398 /* t is removed, t->daddr > x->daddr for any
348 * x in p->avl_left subtree. 399 * x in p->avl_left subtree.
349 * Put t in the old place of p. */ 400 * Put t in the old place of p. */
350 *delp[0] = t; 401 RCU_INIT_POINTER(*delp[0], t);
351 t->avl_left = p->avl_left; 402 t->avl_left = p->avl_left;
352 t->avl_right = p->avl_right; 403 t->avl_right = p->avl_right;
353 t->avl_height = p->avl_height; 404 t->avl_height = p->avl_height;
354 BUG_ON(delp[1] != &p->avl_left); 405 BUG_ON(delp[1] != &p->avl_left);
355 delp[1] = &t->avl_left; /* was &p->avl_left */ 406 delp[1] = &t->avl_left; /* was &p->avl_left */
356 } 407 }
357 peer_avl_rebalance(stack, stackptr); 408 peer_avl_rebalance(stack, stackptr, base);
358 peers.total--; 409 base->total--;
359 do_free = 1; 410 do_free = 1;
360 } 411 }
361 spin_unlock_bh(&peers.lock); 412 write_sequnlock_bh(&base->lock);
362 413
363 if (do_free) 414 if (do_free)
364 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 415 call_rcu(&p->rcu, inetpeer_free_rcu);
365 else 416 else
366 /* The node is used again. Decrease the reference counter 417 /* The node is used again. Decrease the reference counter
367 * back. The loop "cleanup -> unlink_from_unused 418 * back. The loop "cleanup -> unlink_from_unused
@@ -373,8 +424,18 @@ static void unlink_from_pool(struct inet_peer *p)
373 inet_putpeer(p); 424 inet_putpeer(p);
374} 425}
375 426
427static struct inet_peer_base *family_to_base(int family)
428{
429 return (family == AF_INET ? &v4_peers : &v6_peers);
430}
431
432static struct inet_peer_base *peer_to_base(struct inet_peer *p)
433{
434 return family_to_base(p->daddr.family);
435}
436
376/* May be called with local BH enabled. */ 437/* May be called with local BH enabled. */
377static int cleanup_once(unsigned long ttl) 438static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
378{ 439{
379 struct inet_peer *p = NULL; 440 struct inet_peer *p = NULL;
380 441
@@ -406,79 +467,101 @@ static int cleanup_once(unsigned long ttl)
406 * happen because of entry limits in route cache. */ 467 * happen because of entry limits in route cache. */
407 return -1; 468 return -1;
408 469
409 unlink_from_pool(p); 470 unlink_from_pool(p, peer_to_base(p), stack);
410 return 0; 471 return 0;
411} 472}
412 473
413/* Called with or without local BH being disabled. */ 474/* Called with or without local BH being disabled. */
414struct inet_peer *inet_getpeer(__be32 daddr, int create) 475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
415{ 476{
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(daddr->family);
416 struct inet_peer *p; 479 struct inet_peer *p;
417 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 480 unsigned int sequence;
481 int invalidated, newrefcnt = 0;
418 482
419 /* Look up for the address quickly, lockless. 483 /* Look up for the address quickly, lockless.
420 * Because of a concurrent writer, we might not find an existing entry. 484 * Because of a concurrent writer, we might not find an existing entry.
421 */ 485 */
422 rcu_read_lock_bh(); 486 rcu_read_lock();
423 p = lookup_rcu_bh(daddr); 487 sequence = read_seqbegin(&base->lock);
424 rcu_read_unlock_bh(); 488 p = lookup_rcu(daddr, base, &newrefcnt);
489 invalidated = read_seqretry(&base->lock, sequence);
490 rcu_read_unlock();
425 491
426 if (p) { 492 if (p) {
427 /* The existing node has been found. 493found: /* The existing node has been found.
428 * Remove the entry from unused list if it was there. 494 * Remove the entry from unused list if it was there.
429 */ 495 */
430 unlink_from_unused(p); 496 if (newrefcnt == 1)
497 unlink_from_unused(p);
431 return p; 498 return p;
432 } 499 }
433 500
501 /* If no writer did a change during our lookup, we can return early. */
502 if (!create && !invalidated)
503 return NULL;
504
434 /* retry an exact lookup, taking the lock before. 505 /* retry an exact lookup, taking the lock before.
435 * At least, nodes should be hot in our cache. 506 * At least, nodes should be hot in our cache.
436 */ 507 */
437 spin_lock_bh(&peers.lock); 508 write_seqlock_bh(&base->lock);
438 p = lookup(daddr, stack); 509 p = lookup(daddr, stack, base);
439 if (p != peer_avl_empty) { 510 if (p != peer_avl_empty) {
440 atomic_inc(&p->refcnt); 511 newrefcnt = atomic_inc_return(&p->refcnt);
441 spin_unlock_bh(&peers.lock); 512 write_sequnlock_bh(&base->lock);
442 /* Remove the entry from unused list if it was there. */ 513 goto found;
443 unlink_from_unused(p);
444 return p;
445 } 514 }
446 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; 515 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
447 if (p) { 516 if (p) {
448 p->v4daddr = daddr; 517 p->daddr = *daddr;
449 atomic_set(&p->refcnt, 1); 518 atomic_set(&p->refcnt, 1);
450 atomic_set(&p->rid, 0); 519 atomic_set(&p->rid, 0);
451 atomic_set(&p->ip_id_count, secure_ip_id(daddr)); 520 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
452 p->tcp_ts_stamp = 0; 521 p->tcp_ts_stamp = 0;
522 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
523 p->rate_tokens = 0;
524 p->rate_last = 0;
525 p->pmtu_expires = 0;
526 p->pmtu_orig = 0;
527 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
453 INIT_LIST_HEAD(&p->unused); 528 INIT_LIST_HEAD(&p->unused);
454 529
455 530
456 /* Link the node. */ 531 /* Link the node. */
457 link_to_pool(p); 532 link_to_pool(p, base);
458 peers.total++; 533 base->total++;
459 } 534 }
460 spin_unlock_bh(&peers.lock); 535 write_sequnlock_bh(&base->lock);
461 536
462 if (peers.total >= inet_peer_threshold) 537 if (base->total >= inet_peer_threshold)
463 /* Remove one less-recently-used entry. */ 538 /* Remove one less-recently-used entry. */
464 cleanup_once(0); 539 cleanup_once(0, stack);
465 540
466 return p; 541 return p;
467} 542}
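
[Editor's note] The fast path of inet_getpeer() above reads the base->lock sequence, walks the tree with no lock held, then re-checks the sequence: if a writer ran in between, a miss cannot be trusted and the code falls through to the locked lookup. Below is a stripped-down, standalone sketch of that read-retry pattern using a C11 sequence counter over two fields. The field names and the retry cap are invented for the example, and the memory-ordering details are only an approximation of what the kernel's seqlock primitives guarantee.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* even sequence = stable, odd = writer in progress */
static atomic_uint  seq;
static atomic_ulong peer_addr = 0x0a000001;
static atomic_long  peer_rtt  = 40;

static void writer_update(long new_rtt)
{
    unsigned s = atomic_load_explicit(&seq, memory_order_relaxed);
    atomic_store_explicit(&seq, s + 1, memory_order_relaxed);   /* go odd */
    atomic_thread_fence(memory_order_release);
    atomic_store_explicit(&peer_rtt, new_rtt, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);
    atomic_store_explicit(&seq, s + 2, memory_order_release);   /* go even */
}

/* Optimistic read: returns false if a writer interfered every time. */
static bool reader_snapshot(unsigned long *addr, long *rtt)
{
    for (int tries = 0; tries < 100; tries++) {
        unsigned s1 = atomic_load_explicit(&seq, memory_order_acquire);
        if (s1 & 1)
            continue;                           /* writer active, retry */
        *addr = atomic_load_explicit(&peer_addr, memory_order_relaxed);
        *rtt  = atomic_load_explicit(&peer_rtt,  memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);
        unsigned s2 = atomic_load_explicit(&seq, memory_order_relaxed);
        if (s1 == s2)
            return true;        /* nothing changed under us, result is usable */
    }
    return false;               /* give up and take the slow, locked path */
}

int main(void)
{
    unsigned long addr;
    long rtt;

    writer_update(55);
    if (reader_snapshot(&addr, &rtt))
        printf("addr=%#lx rtt=%ld\n", addr, rtt);
    return 0;
}
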
468 543
544static int compute_total(void)
545{
546 return v4_peers.total + v6_peers.total;
547}
548EXPORT_SYMBOL_GPL(inet_getpeer);
549
469/* Called with local BH disabled. */ 550/* Called with local BH disabled. */
470static void peer_check_expire(unsigned long dummy) 551static void peer_check_expire(unsigned long dummy)
471{ 552{
472 unsigned long now = jiffies; 553 unsigned long now = jiffies;
473 int ttl; 554 int ttl, total;
555 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
474 556
475 if (peers.total >= inet_peer_threshold) 557 total = compute_total();
558 if (total >= inet_peer_threshold)
476 ttl = inet_peer_minttl; 559 ttl = inet_peer_minttl;
477 else 560 else
478 ttl = inet_peer_maxttl 561 ttl = inet_peer_maxttl
479 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 562 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
480 peers.total / inet_peer_threshold * HZ; 563 total / inet_peer_threshold * HZ;
481 while (!cleanup_once(ttl)) { 564 while (!cleanup_once(ttl, stack)) {
482 if (jiffies != now) 565 if (jiffies != now)
483 break; 566 break;
484 } 567 }
@@ -486,13 +569,14 @@ static void peer_check_expire(unsigned long dummy)
486 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 569 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
487 * interval depending on the total number of entries (more entries, 570 * interval depending on the total number of entries (more entries,
488 * less interval). */ 571 * less interval). */
489 if (peers.total >= inet_peer_threshold) 572 total = compute_total();
573 if (total >= inet_peer_threshold)
490 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 574 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
491 else 575 else
492 peer_periodic_timer.expires = jiffies 576 peer_periodic_timer.expires = jiffies
493 + inet_peer_gc_maxtime 577 + inet_peer_gc_maxtime
494 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 578 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
495 peers.total / inet_peer_threshold * HZ; 579 total / inet_peer_threshold * HZ;
496 add_timer(&peer_periodic_timer); 580 add_timer(&peer_periodic_timer);
497} 581}
498 582
@@ -508,3 +592,45 @@ void inet_putpeer(struct inet_peer *p)
508 592
509 local_bh_enable(); 593 local_bh_enable();
510} 594}
595EXPORT_SYMBOL_GPL(inet_putpeer);
596
597/*
598 * Check transmit rate limitation for given message.
599 * The rate information is held in the inet_peer entries now.
600 * This function is generic and could be used for other purposes
601 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
602 *
603 * Note that the same inet_peer fields are modified by functions in
604 * route.c too, but these work for packet destinations while xrlim_allow
605 * works for icmp destinations. This means the rate limiting information
606 * for one "ip object" is shared - and these ICMPs are twice limited:
607 * by source and by destination.
608 *
609 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
610 * SHOULD allow setting of rate limits
611 *
612 * Shared between ICMPv4 and ICMPv6.
613 */
614#define XRLIM_BURST_FACTOR 6
615bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
616{
617 unsigned long now, token;
618 bool rc = false;
619
620 if (!peer)
621 return true;
622
623 token = peer->rate_tokens;
624 now = jiffies;
625 token += now - peer->rate_last;
626 peer->rate_last = now;
627 if (token > XRLIM_BURST_FACTOR * timeout)
628 token = XRLIM_BURST_FACTOR * timeout;
629 if (token >= timeout) {
630 token -= timeout;
631 rc = true;
632 }
633 peer->rate_tokens = token;
634 return rc;
635}
636EXPORT_SYMBOL(inet_peer_xrlim_allow);
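
[Editor's note] The comment block above describes the token-bucket policy in prose. A small self-contained C rendition of the same arithmetic may make the refill/burst behaviour easier to see; plain integer "seconds" stand in for jiffies and the main() driver is made up for the example, but the core function mirrors inet_peer_xrlim_allow() line for line.

#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct rate_state {
    unsigned long tokens;   /* accumulated credit, same unit as timeout */
    unsigned long last;     /* time of the previous check */
};

/* Credit grows with elapsed time, is capped at XRLIM_BURST_FACTOR * timeout,
 * and each permitted message costs `timeout` worth of credit. */
static bool xrlim_allow(struct rate_state *st, unsigned long now,
                        unsigned long timeout)
{
    unsigned long token = st->tokens + (now - st->last);
    bool ok = false;

    st->last = now;
    if (token > XRLIM_BURST_FACTOR * timeout)
        token = XRLIM_BURST_FACTOR * timeout;
    if (token >= timeout) {
        token -= timeout;
        ok = true;
    }
    st->tokens = token;
    return ok;
}

int main(void)
{
    /* Start empty, as the kernel does (rate_tokens = 0): with timeout 3,
     * roughly one message in three gets through; an idle period of up to
     * 6 * timeout builds credit for a burst. */
    struct rate_state st = { .tokens = 0, .last = 0 };

    for (unsigned long now = 0; now < 12; now++)
        printf("t=%lu -> %s\n", now, xrlim_allow(&st, now, 3) ? "send" : "drop");
    return 0;
}
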
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 99461f09320f..3b34d1c86270 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb)
84 84
85 rt = skb_rtable(skb); 85 rt = skb_rtable(skb);
86 86
87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway)
88 goto sr_failed; 88 goto sr_failed;
89 89
90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..0ad6035f6366 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -45,6 +45,7 @@
45#include <linux/udp.h> 45#include <linux/udp.h>
46#include <linux/inet.h> 46#include <linux/inet.h>
47#include <linux/netfilter_ipv4.h> 47#include <linux/netfilter_ipv4.h>
48#include <net/inet_ecn.h>
48 49
49/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 50/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
50 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c 51 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +71,46 @@ struct ipq {
70 __be32 daddr; 71 __be32 daddr;
71 __be16 id; 72 __be16 id;
72 u8 protocol; 73 u8 protocol;
74 u8 ecn; /* RFC3168 support */
73 int iif; 75 int iif;
74 unsigned int rid; 76 unsigned int rid;
75 struct inet_peer *peer; 77 struct inet_peer *peer;
76}; 78};
77 79
80/* RFC 3168 support :
 81 * We want to check ECN values of all fragments, to detect invalid combinations. 
82 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
83 */
84#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */
85#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */
86#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */
87#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */
88
89static inline u8 ip4_frag_ecn(u8 tos)
90{
91 return 1 << (tos & INET_ECN_MASK);
92}
93
94/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
95 * Value : 0xff if frame should be dropped.
96 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
97 */
98static const u8 ip4_frag_ecn_table[16] = {
99 /* at least one fragment had CE, and others ECT_0 or ECT_1 */
100 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
101 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
102 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
103
104 /* invalid combinations : drop frame */
105 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
106 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
107 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
108 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
109 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
110 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
111 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
112};
113
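
[Editor's note] To see how the table above is meant to be used: each fragment contributes one bit via ip4_frag_ecn(), the bits are OR-ed into qp->ecn as fragments arrive, and the final value indexes the table to yield either a tos adjustment or the 0xff "drop" marker. A small standalone illustration of that lookup follows; the INET_ECN_* numeric values are assumed to match include/net/inet_ecn.h (they are the standard RFC 3168 codepoints), and the driver in main() is invented for the example.

#include <stdint.h>
#include <stdio.h>

/* ECN codepoints in the low two bits of the IPv4 tos field. */
#define INET_ECN_NOT_ECT 0
#define INET_ECN_ECT_1   1
#define INET_ECN_ECT_0   2
#define INET_ECN_CE      3
#define INET_ECN_MASK    3

#define IPFRAG_ECN_NOT_ECT 0x01
#define IPFRAG_ECN_ECT_1   0x02
#define IPFRAG_ECN_ECT_0   0x04
#define IPFRAG_ECN_CE      0x08

/* One bit per codepoint, exactly as ip4_frag_ecn() does. */
static uint8_t frag_ecn_bit(uint8_t tos)
{
    return 1 << (tos & INET_ECN_MASK);
}

/* Same shape as ip4_frag_ecn_table: 0 = leave tos alone, INET_ECN_CE = mark
 * the reassembled packet CE, 0xff = invalid mix, drop the datagram. */
static const uint8_t ecn_table[16] = {
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                          = INET_ECN_CE,
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                          = INET_ECN_CE,
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]       = INET_ECN_CE,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE]                        = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0]                     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1]                     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]  = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};

int main(void)
{
    /* First fragment arrived ECT(0), a later one was marked CE in transit. */
    uint8_t acc = frag_ecn_bit(INET_ECN_ECT_0) | frag_ecn_bit(INET_ECN_CE);
    printf("ECT_0 + CE   -> table[%#x] = %#x (propagate CE)\n", acc, ecn_table[acc]);

    /* Mixing Not-ECT with an ECT fragment is not a valid RFC 3168 combination. */
    acc = frag_ecn_bit(INET_ECN_NOT_ECT) | frag_ecn_bit(INET_ECN_ECT_1);
    printf("NotECT+ECT_1 -> table[%#x] = %#x (0xff means drop)\n", acc, ecn_table[acc]);
    return 0;
}
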
78static struct inet_frags ip4_frags; 114static struct inet_frags ip4_frags;
79 115
80int ip_frag_nqueues(struct net *net) 116int ip_frag_nqueues(struct net *net)
@@ -116,11 +152,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
116 struct ip4_create_arg *arg = a; 152 struct ip4_create_arg *arg = a;
117 153
118 qp = container_of(q, struct ipq, q); 154 qp = container_of(q, struct ipq, q);
119 return (qp->id == arg->iph->id && 155 return qp->id == arg->iph->id &&
120 qp->saddr == arg->iph->saddr && 156 qp->saddr == arg->iph->saddr &&
121 qp->daddr == arg->iph->daddr && 157 qp->daddr == arg->iph->daddr &&
122 qp->protocol == arg->iph->protocol && 158 qp->protocol == arg->iph->protocol &&
123 qp->user == arg->user); 159 qp->user == arg->user;
124} 160}
125 161
126/* Memory Tracking Functions. */ 162/* Memory Tracking Functions. */
@@ -137,11 +173,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
137 173
138 qp->protocol = arg->iph->protocol; 174 qp->protocol = arg->iph->protocol;
139 qp->id = arg->iph->id; 175 qp->id = arg->iph->id;
176 qp->ecn = ip4_frag_ecn(arg->iph->tos);
140 qp->saddr = arg->iph->saddr; 177 qp->saddr = arg->iph->saddr;
141 qp->daddr = arg->iph->daddr; 178 qp->daddr = arg->iph->daddr;
142 qp->user = arg->user; 179 qp->user = arg->user;
143 qp->peer = sysctl_ipfrag_max_dist ? 180 qp->peer = sysctl_ipfrag_max_dist ?
144 inet_getpeer(arg->iph->saddr, 1) : NULL; 181 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
145} 182}
146 183
147static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 184static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -204,31 +241,30 @@ static void ip_expire(unsigned long arg)
204 241
205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 242 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206 struct sk_buff *head = qp->q.fragments; 243 struct sk_buff *head = qp->q.fragments;
244 const struct iphdr *iph;
245 int err;
207 246
208 rcu_read_lock(); 247 rcu_read_lock();
209 head->dev = dev_get_by_index_rcu(net, qp->iif); 248 head->dev = dev_get_by_index_rcu(net, qp->iif);
210 if (!head->dev) 249 if (!head->dev)
211 goto out_rcu_unlock; 250 goto out_rcu_unlock;
212 251
252 /* skb dst is stale, drop it, and perform route lookup again */
253 skb_dst_drop(head);
254 iph = ip_hdr(head);
255 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
256 iph->tos, head->dev);
257 if (err)
258 goto out_rcu_unlock;
259
213 /* 260 /*
214 * Only search router table for the head fragment, 261 * Only an end host needs to send an ICMP
215 * when defraging timeout at PRE_ROUTING HOOK. 262 * "Fragment Reassembly Timeout" message, per RFC792.
216 */ 263 */
217 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { 264 if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
218 const struct iphdr *iph = ip_hdr(head); 265 skb_rtable(head)->rt_type != RTN_LOCAL)
219 int err = ip_route_input(head, iph->daddr, iph->saddr, 266 goto out_rcu_unlock;
220 iph->tos, head->dev);
221 if (unlikely(err))
222 goto out_rcu_unlock;
223
224 /*
225 * Only an end host needs to send an ICMP
226 * "Fragment Reassembly Timeout" message, per RFC792.
227 */
228 if (skb_rtable(head)->rt_type != RTN_LOCAL)
229 goto out_rcu_unlock;
230 267
231 }
232 268
233 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 269 /* Send an ICMP "Fragment Reassembly Timeout" message. */
234 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 270 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
@@ -316,6 +352,7 @@ static int ip_frag_reinit(struct ipq *qp)
316 qp->q.fragments = NULL; 352 qp->q.fragments = NULL;
317 qp->q.fragments_tail = NULL; 353 qp->q.fragments_tail = NULL;
318 qp->iif = 0; 354 qp->iif = 0;
355 qp->ecn = 0;
319 356
320 return 0; 357 return 0;
321} 358}
@@ -328,6 +365,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
328 int flags, offset; 365 int flags, offset;
329 int ihl, end; 366 int ihl, end;
330 int err = -ENOENT; 367 int err = -ENOENT;
368 u8 ecn;
331 369
332 if (qp->q.last_in & INET_FRAG_COMPLETE) 370 if (qp->q.last_in & INET_FRAG_COMPLETE)
333 goto err; 371 goto err;
@@ -339,6 +377,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
339 goto err; 377 goto err;
340 } 378 }
341 379
380 ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
342 offset = ntohs(ip_hdr(skb)->frag_off); 381 offset = ntohs(ip_hdr(skb)->frag_off);
343 flags = offset & ~IP_OFFSET; 382 flags = offset & ~IP_OFFSET;
344 offset &= IP_OFFSET; 383 offset &= IP_OFFSET;
@@ -472,6 +511,7 @@ found:
472 } 511 }
473 qp->q.stamp = skb->tstamp; 512 qp->q.stamp = skb->tstamp;
474 qp->q.meat += skb->len; 513 qp->q.meat += skb->len;
514 qp->ecn |= ecn;
475 atomic_add(skb->truesize, &qp->q.net->mem); 515 atomic_add(skb->truesize, &qp->q.net->mem);
476 if (offset == 0) 516 if (offset == 0)
477 qp->q.last_in |= INET_FRAG_FIRST_IN; 517 qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -502,9 +542,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
502 int len; 542 int len;
503 int ihlen; 543 int ihlen;
504 int err; 544 int err;
545 u8 ecn;
505 546
506 ipq_kill(qp); 547 ipq_kill(qp);
507 548
549 ecn = ip4_frag_ecn_table[qp->ecn];
550 if (unlikely(ecn == 0xff)) {
551 err = -EINVAL;
552 goto out_fail;
553 }
508 /* Make the one we just received the head. */ 554 /* Make the one we just received the head. */
509 if (prev) { 555 if (prev) {
510 head = prev->next; 556 head = prev->next;
@@ -542,7 +588,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
542 /* If the first fragment is fragmented itself, we split 588 /* If the first fragment is fragmented itself, we split
543 * it to two chunks: the first with data and paged part 589 * it to two chunks: the first with data and paged part
544 * and the second, holding only fragments. */ 590 * and the second, holding only fragments. */
545 if (skb_has_frags(head)) { 591 if (skb_has_frag_list(head)) {
546 struct sk_buff *clone; 592 struct sk_buff *clone;
547 int i, plen = 0; 593 int i, plen = 0;
548 594
@@ -583,6 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
583 iph = ip_hdr(head); 629 iph = ip_hdr(head);
584 iph->frag_off = 0; 630 iph->frag_off = 0;
585 iph->tot_len = htons(len); 631 iph->tot_len = htons(len);
632 iph->tos |= ecn;
586 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 633 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
587 qp->q.fragments = NULL; 634 qp->q.fragments = NULL;
588 qp->q.fragments_tail = NULL; 635 qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 35c93e8b6a46..8871067560db 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
47 48
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 50#include <net/ipv6.h>
@@ -63,13 +64,13 @@
63 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best 67 and silently drop packet when it expires. It is a good
67 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaing new variable in ALL
68 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
69 70
70 Current solution: HARD_TX_LOCK lock breaks dead loops. 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
71 72 counter, since when we enter the first ndo_xmit(), cpu migration is
72 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
73 74
74 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
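
[Editor's note] The rewritten comment refers to the xmit_recursion per-cpu counter introduced elsewhere in this series (in net/core/dev.c, where RECURSION_LIMIT is defined); the details below are an assumption based on that comment, not part of this file. A rough single-threaded sketch of such a guard, with a thread-local counter standing in for the per-cpu variable and a toy transmit path that deliberately loops back into itself:

#include <stdio.h>

#define RECURSION_LIMIT 10   /* a small bound of the same order as the kernel's */

/* Thread-local here; the kernel can use a per-cpu counter because migration
 * is not possible while a packet is being transmitted. */
static _Thread_local int xmit_recursion;

static int dev_xmit(int depth_hint);

/* A toy "tunnel" that re-enters the transmit path, as a misconfigured
 * gre-over-gre loop would. */
static int tunnel_xmit(int depth_hint)
{
    return dev_xmit(depth_hint + 1);
}

static int dev_xmit(int depth_hint)
{
    if (xmit_recursion > RECURSION_LIMIT) {
        fprintf(stderr, "dead loop detected at depth %d, dropping packet\n",
                depth_hint);
        return -1;
    }
    xmit_recursion++;
    int ret = tunnel_xmit(depth_hint);   /* would normally hand off to a device */
    xmit_recursion--;
    return ret;
}

int main(void)
{
    return dev_xmit(0) == -1 ? 0 : 1;
}
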
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 129
129static int ipgre_net_id __read_mostly; 130static int ipgre_net_id __read_mostly;
130struct ipgre_net { 131struct ipgre_net {
131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
132 133
133 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
134}; 135};
@@ -158,13 +159,40 @@ struct ipgre_net {
158#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
159#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
160/* 161/*
161 * Locking : hash tables are protected by RCU and a spinlock 162 * Locking : hash tables are protected by RCU and RTNL
162 */ 163 */
163static DEFINE_SPINLOCK(ipgre_lock);
164 164
165#define for_each_ip_tunnel_rcu(start) \ 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168/* often modified stats are per cpu, other are shared (netdev->stats) */
169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
195
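
[Editor's note] ipgre_get_stats() above is the read side of the new scheme: the hot path bumps cheap per-cpu counters without any lock, and the rare stats read folds them together. A condensed userspace sketch of the same split follows, with one counter slot per worker thread instead of per CPU; the thread count and field names are arbitrary.

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

struct tstats { unsigned long rx_packets; unsigned long rx_bytes; };

/* One private slot per worker: the fast path never shares a lock. */
static struct tstats per_worker[NWORKERS];

static void *rx_worker(void *arg)
{
    struct tstats *my = &per_worker[(long)arg];

    for (int i = 0; i < 100000; i++) {      /* pretend each loop is one packet */
        my->rx_packets++;
        my->rx_bytes += 1500;
    }
    return NULL;
}

/* The equivalent of ipgre_get_stats(): sum the per-worker slots. */
static struct tstats sum_stats(void)
{
    struct tstats sum = { 0, 0 };

    for (int i = 0; i < NWORKERS; i++) {
        sum.rx_packets += per_worker[i].rx_packets;
        sum.rx_bytes   += per_worker[i].rx_bytes;
    }
    return sum;
}

int main(void)
{
    pthread_t tid[NWORKERS];

    for (long i = 0; i < NWORKERS; i++)
        pthread_create(&tid[i], NULL, rx_worker, (void *)i);
    for (int i = 0; i < NWORKERS; i++)
        pthread_join(tid[i], NULL);

    struct tstats total = sum_stats();
    printf("rx_packets=%lu rx_bytes=%lu\n", total.rx_packets, total.rx_bytes);
    return 0;
}
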
168/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
169 197
170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
173{ 201{
174 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
175 int link = dev->ifindex; 203 int link = dev->ifindex;
176 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
177 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
289 return NULL; 317 return NULL;
290} 318}
291 319
292static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
294{ 322{
295 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
298 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
299 int prio = 0; 327 int prio = 0;
300 328
301 if (local) 329 if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
308 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
309} 337}
310 338
311static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312 struct ip_tunnel *t) 340 struct ip_tunnel *t)
313{ 341{
314 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
316 344
317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318{ 346{
319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320 348
321 spin_lock_bh(&ipgre_lock); 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322 t->next = *tp;
323 rcu_assign_pointer(*tp, t); 350 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
325} 351}
326 352
327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328{ 354{
329 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
330 356 struct ip_tunnel *iter;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
332 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
333 spin_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
334 *tp = t->next; 360 tp = &iter->next) {
335 spin_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
336 break; 363 break;
337 } 364 }
338 } 365 }
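
[Editor's note] The link/unlink rewrite above drops the spinlock: updates are serialized by RTNL, rcu_assign_pointer() publishes fully-initialized nodes, and readers walk the chain under rcu_read_lock(). The heart of that is safe pointer publication. Below is a stripped-down C11 sketch of just that part (release store on publish, acquire load on traversal); it deliberately never frees nodes, because the grace-period half of RCU is what call_rcu()/synchronize_net() provide and is not reproduced here, and all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct tunnel {
    unsigned long key;
    struct tunnel *_Atomic next;
};

static struct tunnel *_Atomic bucket;   /* head of one hash chain */

/* Writer side (single writer, like code running under RTNL): fill in the
 * node first, then publish it with a release store so readers that see the
 * pointer also see the initialized contents. */
static void tunnel_link(unsigned long key)
{
    struct tunnel *t = calloc(1, sizeof(*t));

    t->key = key;
    atomic_store_explicit(&t->next,
                          atomic_load_explicit(&bucket, memory_order_relaxed),
                          memory_order_relaxed);
    atomic_store_explicit(&bucket, t, memory_order_release);
}

/* Reader side: acquire loads pair with the release store above, a rough
 * analogue of rcu_dereference() inside rcu_read_lock(). */
static struct tunnel *tunnel_lookup(unsigned long key)
{
    struct tunnel *t = atomic_load_explicit(&bucket, memory_order_acquire);

    while (t) {
        if (t->key == key)
            return t;
        t = atomic_load_explicit(&t->next, memory_order_acquire);
    }
    return NULL;
}

int main(void)
{
    tunnel_link(1);
    tunnel_link(2);
    printf("lookup(1) %s\n", tunnel_lookup(1) ? "found" : "missing");
    printf("lookup(3) %s\n", tunnel_lookup(3) ? "found" : "missing");
    return 0;
}
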
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
346 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
348 int link = parms->link; 375 int link = parms->link;
349 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 379
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
353 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
360 return t; 390 return t;
361} 391}
362 392
363static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
365{ 395{
366 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -375,19 +405,14 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
375 if (parms->name[0]) 405 if (parms->name[0])
376 strlcpy(name, parms->name, IFNAMSIZ); 406 strlcpy(name, parms->name, IFNAMSIZ);
377 else 407 else
378 sprintf(name, "gre%%d"); 408 strcpy(name, "gre%d");
379 409
380 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381 if (!dev) 411 if (!dev)
382 return NULL; 412 return NULL;
383 413
384 dev_net_set(dev, net); 414 dev_net_set(dev, net);
385 415
386 if (strchr(name, '%')) {
387 if (dev_alloc_name(dev, name) < 0)
388 goto failed_free;
389 }
390
391 nt = netdev_priv(dev); 416 nt = netdev_priv(dev);
392 nt->parms = *parms; 417 nt->parms = *parms;
393 dev->rtnl_link_ops = &ipgre_link_ops; 418 dev->rtnl_link_ops = &ipgre_link_ops;
@@ -432,7 +457,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
432 by themself??? 457 by themself???
433 */ 458 */
434 459
435 struct iphdr *iph = (struct iphdr *)skb->data; 460 const struct iphdr *iph = (const struct iphdr *)skb->data;
436 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 461 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
437 int grehlen = (iph->ihl<<2) + 4; 462 int grehlen = (iph->ihl<<2) + 4;
438 const int type = icmp_hdr(skb)->type; 463 const int type = icmp_hdr(skb)->type;
@@ -504,7 +529,7 @@ out:
504 rcu_read_unlock(); 529 rcu_read_unlock();
505} 530}
506 531
507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 532static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
508{ 533{
509 if (INET_ECN_is_ce(iph->tos)) { 534 if (INET_ECN_is_ce(iph->tos)) {
510 if (skb->protocol == htons(ETH_P_IP)) { 535 if (skb->protocol == htons(ETH_P_IP)) {
@@ -516,19 +541,19 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
516} 541}
517 542
518static inline u8 543static inline u8
519ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) 544ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
520{ 545{
521 u8 inner = 0; 546 u8 inner = 0;
522 if (skb->protocol == htons(ETH_P_IP)) 547 if (skb->protocol == htons(ETH_P_IP))
523 inner = old_iph->tos; 548 inner = old_iph->tos;
524 else if (skb->protocol == htons(ETH_P_IPV6)) 549 else if (skb->protocol == htons(ETH_P_IPV6))
525 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 550 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
526 return INET_ECN_encapsulate(tos, inner); 551 return INET_ECN_encapsulate(tos, inner);
527} 552}
528 553
529static int ipgre_rcv(struct sk_buff *skb) 554static int ipgre_rcv(struct sk_buff *skb)
530{ 555{
531 struct iphdr *iph; 556 const struct iphdr *iph;
532 u8 *h; 557 u8 *h;
533 __be16 flags; 558 __be16 flags;
534 __sum16 csum = 0; 559 __sum16 csum = 0;
@@ -582,7 +607,7 @@ static int ipgre_rcv(struct sk_buff *skb)
582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 607 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583 iph->saddr, iph->daddr, key, 608 iph->saddr, iph->daddr, key,
584 gre_proto))) { 609 gre_proto))) {
585 struct net_device_stats *stats = &tunnel->dev->stats; 610 struct pcpu_tstats *tstats;
586 611
587 secpath_reset(skb); 612 secpath_reset(skb);
588 613
@@ -604,24 +629,24 @@ static int ipgre_rcv(struct sk_buff *skb)
604#ifdef CONFIG_NET_IPGRE_BROADCAST 629#ifdef CONFIG_NET_IPGRE_BROADCAST
605 if (ipv4_is_multicast(iph->daddr)) { 630 if (ipv4_is_multicast(iph->daddr)) {
606 /* Looped back packet, drop it! */ 631 /* Looped back packet, drop it! */
607 if (skb_rtable(skb)->fl.iif == 0) 632 if (rt_is_output_route(skb_rtable(skb)))
608 goto drop; 633 goto drop;
609 stats->multicast++; 634 tunnel->dev->stats.multicast++;
610 skb->pkt_type = PACKET_BROADCAST; 635 skb->pkt_type = PACKET_BROADCAST;
611 } 636 }
612#endif 637#endif
613 638
614 if (((flags&GRE_CSUM) && csum) || 639 if (((flags&GRE_CSUM) && csum) ||
615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 640 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616 stats->rx_crc_errors++; 641 tunnel->dev->stats.rx_crc_errors++;
617 stats->rx_errors++; 642 tunnel->dev->stats.rx_errors++;
618 goto drop; 643 goto drop;
619 } 644 }
620 if (tunnel->parms.i_flags&GRE_SEQ) { 645 if (tunnel->parms.i_flags&GRE_SEQ) {
621 if (!(flags&GRE_SEQ) || 646 if (!(flags&GRE_SEQ) ||
622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 647 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623 stats->rx_fifo_errors++; 648 tunnel->dev->stats.rx_fifo_errors++;
624 stats->rx_errors++; 649 tunnel->dev->stats.rx_errors++;
625 goto drop; 650 goto drop;
626 } 651 }
627 tunnel->i_seqno = seqno + 1; 652 tunnel->i_seqno = seqno + 1;
@@ -630,8 +655,8 @@ static int ipgre_rcv(struct sk_buff *skb)
630 /* Warning: All skb pointers will be invalidated! */ 655 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 656 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 657 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 658 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 659 tunnel->dev->stats.rx_errors++;
635 goto drop; 660 goto drop;
636 } 661 }
637 662
@@ -640,14 +665,19 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 665 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 666 }
642 667
643 skb_tunnel_rx(skb, tunnel->dev); 668 tstats = this_cpu_ptr(tunnel->dev->tstats);
669 tstats->rx_packets++;
670 tstats->rx_bytes += skb->len;
671
672 __skb_tunnel_rx(skb, tunnel->dev);
644 673
645 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
646 ipgre_ecn_decapsulate(iph, skb); 675 ipgre_ecn_decapsulate(iph, skb);
647 676
648 netif_rx(skb); 677 netif_rx(skb);
678
649 rcu_read_unlock(); 679 rcu_read_unlock();
650 return(0); 680 return 0;
651 } 681 }
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 682 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653 683
@@ -655,20 +685,20 @@ drop:
655 rcu_read_unlock(); 685 rcu_read_unlock();
656drop_nolock: 686drop_nolock:
657 kfree_skb(skb); 687 kfree_skb(skb);
658 return(0); 688 return 0;
659} 689}
660 690
661static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 691static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662{ 692{
663 struct ip_tunnel *tunnel = netdev_priv(dev); 693 struct ip_tunnel *tunnel = netdev_priv(dev);
664 struct net_device_stats *stats = &dev->stats; 694 struct pcpu_tstats *tstats;
665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); 695 const struct iphdr *old_iph = ip_hdr(skb);
666 struct iphdr *old_iph = ip_hdr(skb); 696 const struct iphdr *tiph;
667 struct iphdr *tiph; 697 struct flowi4 fl4;
668 u8 tos; 698 u8 tos;
669 __be16 df; 699 __be16 df;
670 struct rtable *rt; /* Route to the other host */ 700 struct rtable *rt; /* Route to the other host */
671 struct net_device *tdev; /* Device to other host */ 701 struct net_device *tdev; /* Device to other host */
672 struct iphdr *iph; /* Our new IP header */ 702 struct iphdr *iph; /* Our new IP header */
673 unsigned int max_headroom; /* The extra header space needed */ 703 unsigned int max_headroom; /* The extra header space needed */
674 int gre_hlen; 704 int gre_hlen;
@@ -680,7 +710,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
680 710
681 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 711 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
682 gre_hlen = 0; 712 gre_hlen = 0;
683 tiph = (struct iphdr *)skb->data; 713 tiph = (const struct iphdr *)skb->data;
684 } else { 714 } else {
685 gre_hlen = tunnel->hlen; 715 gre_hlen = tunnel->hlen;
686 tiph = &tunnel->parms.iph; 716 tiph = &tunnel->parms.iph;
@@ -690,7 +720,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
690 /* NBMA tunnel */ 720 /* NBMA tunnel */
691 721
692 if (skb_dst(skb) == NULL) { 722 if (skb_dst(skb) == NULL) {
693 stats->tx_fifo_errors++; 723 dev->stats.tx_fifo_errors++;
694 goto tx_error; 724 goto tx_error;
695 } 725 }
696 726
@@ -701,14 +731,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
701 } 731 }
702#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703 else if (skb->protocol == htons(ETH_P_IPV6)) { 733 else if (skb->protocol == htons(ETH_P_IPV6)) {
704 struct in6_addr *addr6; 734 const struct in6_addr *addr6;
705 int addr_type; 735 int addr_type;
706 struct neighbour *neigh = skb_dst(skb)->neighbour; 736 struct neighbour *neigh = skb_dst(skb)->neighbour;
707 737
708 if (neigh == NULL) 738 if (neigh == NULL)
709 goto tx_error; 739 goto tx_error;
710 740
711 addr6 = (struct in6_addr *)&neigh->primary_key; 741 addr6 = (const struct in6_addr *)&neigh->primary_key;
712 addr_type = ipv6_addr_type(addr6); 742 addr_type = ipv6_addr_type(addr6);
713 743
714 if (addr_type == IPV6_ADDR_ANY) { 744 if (addr_type == IPV6_ADDR_ANY) {
@@ -732,26 +762,21 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
732 if (skb->protocol == htons(ETH_P_IP)) 762 if (skb->protocol == htons(ETH_P_IP))
733 tos = old_iph->tos; 763 tos = old_iph->tos;
734 else if (skb->protocol == htons(ETH_P_IPV6)) 764 else if (skb->protocol == htons(ETH_P_IPV6))
735 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 765 tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
736 } 766 }
737 767
738 { 768 rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
739 struct flowi fl = { .oif = tunnel->parms.link, 769 tunnel->parms.o_key, RT_TOS(tos),
740 .nl_u = { .ip4_u = 770 tunnel->parms.link);
741 { .daddr = dst, 771 if (IS_ERR(rt)) {
742 .saddr = tiph->saddr, 772 dev->stats.tx_carrier_errors++;
743 .tos = RT_TOS(tos) } }, 773 goto tx_error;
744 .proto = IPPROTO_GRE };
745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746 stats->tx_carrier_errors++;
747 goto tx_error;
748 }
749 } 774 }
750 tdev = rt->dst.dev; 775 tdev = rt->dst.dev;
751 776
752 if (tdev == dev) { 777 if (tdev == dev) {
753 ip_rt_put(rt); 778 ip_rt_put(rt);
754 stats->collisions++; 779 dev->stats.collisions++;
755 goto tx_error; 780 goto tx_error;
756 } 781 }
757 782
@@ -783,7 +808,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
783 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 808 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
784 rt6->rt6i_dst.plen == 128) { 809 rt6->rt6i_dst.plen == 128) {
785 rt6->rt6i_flags |= RTF_MODIFIED; 810 rt6->rt6i_flags |= RTF_MODIFIED;
786 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 811 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
787 } 812 }
788 } 813 }
789 814
@@ -814,7 +839,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
814 dev->needed_headroom = max_headroom; 839 dev->needed_headroom = max_headroom;
815 if (!new_skb) { 840 if (!new_skb) {
816 ip_rt_put(rt); 841 ip_rt_put(rt);
817 txq->tx_dropped++; 842 dev->stats.tx_dropped++;
818 dev_kfree_skb(skb); 843 dev_kfree_skb(skb);
819 return NETDEV_TX_OK; 844 return NETDEV_TX_OK;
820 } 845 }
@@ -844,18 +869,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
844 iph->frag_off = df; 869 iph->frag_off = df;
845 iph->protocol = IPPROTO_GRE; 870 iph->protocol = IPPROTO_GRE;
846 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 871 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
847 iph->daddr = rt->rt_dst; 872 iph->daddr = fl4.daddr;
848 iph->saddr = rt->rt_src; 873 iph->saddr = fl4.saddr;
849 874
850 if ((iph->ttl = tiph->ttl) == 0) { 875 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP)) 876 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl; 877 iph->ttl = old_iph->ttl;
853#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 878#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854 else if (skb->protocol == htons(ETH_P_IPV6)) 879 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 880 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
856#endif 881#endif
857 else 882 else
858 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 883 iph->ttl = ip4_dst_hoplimit(&rt->dst);
859 } 884 }
860 885
861 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 886 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -881,15 +906,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
881 } 906 }
882 907
883 nf_reset(skb); 908 nf_reset(skb);
884 909 tstats = this_cpu_ptr(dev->tstats);
885 IPTUNNEL_XMIT(); 910 __IPTUNNEL_XMIT(tstats, &dev->stats);
886 return NETDEV_TX_OK; 911 return NETDEV_TX_OK;
887 912
888tx_error_icmp: 913tx_error_icmp:
889 dst_link_failure(skb); 914 dst_link_failure(skb);
890 915
891tx_error: 916tx_error:
892 stats->tx_errors++; 917 dev->stats.tx_errors++;
893 dev_kfree_skb(skb); 918 dev_kfree_skb(skb);
894 return NETDEV_TX_OK; 919 return NETDEV_TX_OK;
895} 920}
@@ -898,7 +923,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
898{ 923{
899 struct net_device *tdev = NULL; 924 struct net_device *tdev = NULL;
900 struct ip_tunnel *tunnel; 925 struct ip_tunnel *tunnel;
901 struct iphdr *iph; 926 const struct iphdr *iph;
902 int hlen = LL_MAX_HEADER; 927 int hlen = LL_MAX_HEADER;
903 int mtu = ETH_DATA_LEN; 928 int mtu = ETH_DATA_LEN;
904 int addend = sizeof(struct iphdr) + 4; 929 int addend = sizeof(struct iphdr) + 4;
@@ -909,14 +934,15 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
909 /* Guess output device to choose reasonable mtu and needed_headroom */ 934 /* Guess output device to choose reasonable mtu and needed_headroom */
910 935
911 if (iph->daddr) { 936 if (iph->daddr) {
912 struct flowi fl = { .oif = tunnel->parms.link, 937 struct flowi4 fl4;
913 .nl_u = { .ip4_u =
914 { .daddr = iph->daddr,
915 .saddr = iph->saddr,
916 .tos = RT_TOS(iph->tos) } },
917 .proto = IPPROTO_GRE };
918 struct rtable *rt; 938 struct rtable *rt;
919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 939
940 rt = ip_route_output_gre(dev_net(dev), &fl4,
941 iph->daddr, iph->saddr,
942 tunnel->parms.o_key,
943 RT_TOS(iph->tos),
944 tunnel->parms.link);
945 if (!IS_ERR(rt)) {
920 tdev = rt->dst.dev; 946 tdev = rt->dst.dev;
921 ip_rt_put(rt); 947 ip_rt_put(rt);
922 } 948 }
@@ -1012,7 +1038,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1012 break; 1038 break;
1013 } 1039 }
1014 } else { 1040 } else {
1015 unsigned nflags = 0; 1041 unsigned int nflags = 0;
1016 1042
1017 t = netdev_priv(dev); 1043 t = netdev_priv(dev);
1018 1044
@@ -1026,6 +1052,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1026 break; 1052 break;
1027 } 1053 }
1028 ipgre_tunnel_unlink(ign, t); 1054 ipgre_tunnel_unlink(ign, t);
1055 synchronize_net();
1029 t->parms.iph.saddr = p.iph.saddr; 1056 t->parms.iph.saddr = p.iph.saddr;
1030 t->parms.iph.daddr = p.iph.daddr; 1057 t->parms.iph.daddr = p.iph.daddr;
1031 t->parms.i_key = p.i_key; 1058 t->parms.i_key = p.i_key;
@@ -1125,7 +1152,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1125 1152
1126static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1153static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127 unsigned short type, 1154 unsigned short type,
1128 const void *daddr, const void *saddr, unsigned len) 1155 const void *daddr, const void *saddr, unsigned int len)
1129{ 1156{
1130 struct ip_tunnel *t = netdev_priv(dev); 1157 struct ip_tunnel *t = netdev_priv(dev);
1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1158 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1151,7 +1178,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1151 1178
1152static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 1179static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153{ 1180{
1154 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); 1181 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1155 memcpy(haddr, &iph->saddr, 4); 1182 memcpy(haddr, &iph->saddr, 4);
1156 return 4; 1183 return 4;
1157} 1184}
@@ -1167,14 +1194,16 @@ static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev); 1194 struct ip_tunnel *t = netdev_priv(dev);
1168 1195
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1196 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link, 1197 struct flowi4 fl4;
1171 .nl_u = { .ip4_u =
1172 { .daddr = t->parms.iph.daddr,
1173 .saddr = t->parms.iph.saddr,
1174 .tos = RT_TOS(t->parms.iph.tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt; 1198 struct rtable *rt;
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1199
1200 rt = ip_route_output_gre(dev_net(dev), &fl4,
1201 t->parms.iph.daddr,
1202 t->parms.iph.saddr,
1203 t->parms.o_key,
1204 RT_TOS(t->parms.iph.tos),
1205 t->parms.link);
1206 if (IS_ERR(rt))
1178 return -EADDRNOTAVAIL; 1207 return -EADDRNOTAVAIL;
1179 dev = rt->dst.dev; 1208 dev = rt->dst.dev;
1180 ip_rt_put(rt); 1209 ip_rt_put(rt);
@@ -1193,10 +1222,8 @@ static int ipgre_close(struct net_device *dev)
1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1222 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194 struct in_device *in_dev; 1223 struct in_device *in_dev;
1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1224 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 if (in_dev) { 1225 if (in_dev)
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1226 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1199 }
1200 } 1227 }
1201 return 0; 1228 return 0;
1202} 1229}
@@ -1213,12 +1240,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1240 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1241 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1242 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1243 .ndo_get_stats = ipgre_get_stats,
1216}; 1244};
1217 1245
1246static void ipgre_dev_free(struct net_device *dev)
1247{
1248 free_percpu(dev->tstats);
1249 free_netdev(dev);
1250}
1251
1218static void ipgre_tunnel_setup(struct net_device *dev) 1252static void ipgre_tunnel_setup(struct net_device *dev)
1219{ 1253{
1220 dev->netdev_ops = &ipgre_netdev_ops; 1254 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev; 1255 dev->destructor = ipgre_dev_free;
1222 1256
1223 dev->type = ARPHRD_IPGRE; 1257 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1258 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1290,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1256 } else 1290 } else
1257 dev->header_ops = &ipgre_header_ops; 1291 dev->header_ops = &ipgre_header_ops;
1258 1292
1293 dev->tstats = alloc_percpu(struct pcpu_tstats);
1294 if (!dev->tstats)
1295 return -ENOMEM;
1296
1259 return 0; 1297 return 0;
1260} 1298}
1261 1299
@@ -1263,7 +1301,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1263{ 1301{
1264 struct ip_tunnel *tunnel = netdev_priv(dev); 1302 struct ip_tunnel *tunnel = netdev_priv(dev);
1265 struct iphdr *iph = &tunnel->parms.iph; 1303 struct iphdr *iph = &tunnel->parms.iph;
1266 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1267 1304
1268 tunnel->dev = dev; 1305 tunnel->dev = dev;
1269 strcpy(tunnel->parms.name, dev->name); 1306 strcpy(tunnel->parms.name, dev->name);
@@ -1274,14 +1311,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1311 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 1312
1276 dev_hold(dev); 1313 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel;
1278} 1314}
1279 1315
1280 1316
1281static const struct net_protocol ipgre_protocol = { 1317static const struct gre_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv, 1318 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err, 1319 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1285}; 1320};
1286 1321
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1322static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1326,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) { 1326 for (prio = 0; prio < 4; prio++) {
1292 int h; 1327 int h;
1293 for (h = 0; h < HASH_SIZE; h++) { 1328 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1329 struct ip_tunnel *t;
1330
1331 t = rtnl_dereference(ign->tunnels[prio][h]);
1295 1332
1296 while (t != NULL) { 1333 while (t != NULL) {
1297 unregister_netdevice_queue(t->dev, head); 1334 unregister_netdevice_queue(t->dev, head);
1298 t = t->next; 1335 t = rtnl_dereference(t->next);
1299 } 1336 }
1300 } 1337 }
1301 } 1338 }
@@ -1320,10 +1357,12 @@ static int __net_init ipgre_init_net(struct net *net)
1320 if ((err = register_netdev(ign->fb_tunnel_dev))) 1357 if ((err = register_netdev(ign->fb_tunnel_dev)))
1321 goto err_reg_dev; 1358 goto err_reg_dev;
1322 1359
1360 rcu_assign_pointer(ign->tunnels_wc[0],
1361 netdev_priv(ign->fb_tunnel_dev));
1323 return 0; 1362 return 0;
1324 1363
1325err_reg_dev: 1364err_reg_dev:
1326 free_netdev(ign->fb_tunnel_dev); 1365 ipgre_dev_free(ign->fb_tunnel_dev);
1327err_alloc_dev: 1366err_alloc_dev:
1328 return err; 1367 return err;
1329} 1368}
@@ -1441,6 +1480,10 @@ static int ipgre_tap_init(struct net_device *dev)
1441 1480
1442 ipgre_tunnel_bind_dev(dev); 1481 ipgre_tunnel_bind_dev(dev);
1443 1482
1483 dev->tstats = alloc_percpu(struct pcpu_tstats);
1484 if (!dev->tstats)
1485 return -ENOMEM;
1486
1444 return 0; 1487 return 0;
1445} 1488}
1446 1489
@@ -1451,6 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_set_mac_address = eth_mac_addr, 1494 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr, 1495 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1496 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1497 .ndo_get_stats = ipgre_get_stats,
1454}; 1498};
1455 1499
1456static void ipgre_tap_setup(struct net_device *dev) 1500static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1503,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1459 ether_setup(dev); 1503 ether_setup(dev);
1460 1504
1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1505 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev; 1506 dev->destructor = ipgre_dev_free;
1463 1507
1464 dev->iflink = 0; 1508 dev->iflink = 0;
1465 dev->features |= NETIF_F_NETNS_LOCAL; 1509 dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1531,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1487 if (!tb[IFLA_MTU]) 1531 if (!tb[IFLA_MTU])
1488 dev->mtu = mtu; 1532 dev->mtu = mtu;
1489 1533
1534 /* Can use a lockless transmit, unless we generate output sequences */
1535 if (!(nt->parms.o_flags & GRE_SEQ))
1536 dev->features |= NETIF_F_LLTX;
1537
1490 err = register_netdevice(dev); 1538 err = register_netdevice(dev);
1491 if (err) 1539 if (err)
1492 goto out; 1540 goto out;
@@ -1522,7 +1570,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1522 t = nt; 1570 t = nt;
1523 1571
1524 if (dev->type != ARPHRD_ETHER) { 1572 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0; 1573 unsigned int nflags = 0;
1526 1574
1527 if (ipv4_is_multicast(p.iph.daddr)) 1575 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST; 1576 nflags = IFF_BROADCAST;
@@ -1663,7 +1711,7 @@ static int __init ipgre_init(void)
1663 if (err < 0) 1711 if (err < 0)
1664 return err; 1712 return err;
1665 1713
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1714 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1667 if (err < 0) { 1715 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1716 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed; 1717 goto add_proto_failed;
@@ -1683,7 +1731,7 @@ out:
1683tap_ops_failed: 1731tap_ops_failed:
1684 rtnl_link_unregister(&ipgre_link_ops); 1732 rtnl_link_unregister(&ipgre_link_ops);
1685rtnl_link_failed: 1733rtnl_link_failed:
1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1734 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1687add_proto_failed: 1735add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops); 1736 unregister_pernet_device(&ipgre_net_ops);
1689 goto out; 1737 goto out;
@@ -1693,7 +1741,7 @@ static void __exit ipgre_fini(void)
1693{ 1741{
1694 rtnl_link_unregister(&ipgre_tap_ops); 1742 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops); 1743 rtnl_link_unregister(&ipgre_link_ops);
1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1744 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1745 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops); 1746 unregister_pernet_device(&ipgre_net_ops);
1699} 1747}
@@ -1703,3 +1751,4 @@ module_exit(ipgre_fini);
1703MODULE_LICENSE("GPL"); 1751MODULE_LICENSE("GPL");
1704MODULE_ALIAS_RTNL_LINK("gre"); 1752MODULE_ALIAS_RTNL_LINK("gre");
1705MODULE_ALIAS_RTNL_LINK("gretap"); 1753MODULE_ALIAS_RTNL_LINK("gretap");
1754MODULE_ALIAS_NETDEV("gre0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..c8f48efc5fd3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -268,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb)
268static inline int ip_rcv_options(struct sk_buff *skb) 268static inline int ip_rcv_options(struct sk_buff *skb)
269{ 269{
270 struct ip_options *opt; 270 struct ip_options *opt;
271 struct iphdr *iph; 271 const struct iphdr *iph;
272 struct net_device *dev = skb->dev; 272 struct net_device *dev = skb->dev;
273 273
274 /* It looks as overkill, because not all 274 /* It looks as overkill, because not all
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
@@ -374,7 +374,7 @@ drop:
374 */ 374 */
375int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 375int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
376{ 376{
377 struct iphdr *iph; 377 const struct iphdr *iph;
378 u32 len; 378 u32 len;
379 379
380 /* When the interface is in promisc. mode, drop all the crap 380 /* When the interface is in promisc. mode, drop all the crap
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..ec93335901dd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/unaligned.h>
17#include <linux/skbuff.h> 18#include <linux/skbuff.h>
18#include <linux/ip.h> 19#include <linux/ip.h>
19#include <linux/icmp.h> 20#include <linux/icmp.h>
@@ -36,8 +37,8 @@
36 * saddr is address of outgoing interface. 37 * saddr is address of outgoing interface.
37 */ 38 */
38 39
39void ip_options_build(struct sk_buff * skb, struct ip_options * opt, 40void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
40 __be32 daddr, struct rtable *rt, int is_frag) 41 __be32 daddr, struct rtable *rt, int is_frag)
41{ 42{
42 unsigned char *iph = skb_network_header(skb); 43 unsigned char *iph = skb_network_header(skb);
43 44
@@ -50,9 +51,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
50 51
51 if (!is_frag) { 52 if (!is_frag) {
52 if (opt->rr_needaddr) 53 if (opt->rr_needaddr)
53 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); 54 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
54 if (opt->ts_needaddr) 55 if (opt->ts_needaddr)
55 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); 56 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
56 if (opt->ts_needtime) { 57 if (opt->ts_needtime) {
57 struct timespec tv; 58 struct timespec tv;
58 __be32 midtime; 59 __be32 midtime;
@@ -83,9 +84,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
83 * NOTE: dopt cannot point to skb. 84 * NOTE: dopt cannot point to skb.
84 */ 85 */
85 86
86int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) 87int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
87{ 88{
88 struct ip_options *sopt; 89 const struct ip_options *sopt;
89 unsigned char *sptr, *dptr; 90 unsigned char *sptr, *dptr;
90 int soffset, doffset; 91 int soffset, doffset;
91 int optlen; 92 int optlen;
@@ -95,10 +96,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
95 96
96 sopt = &(IPCB(skb)->opt); 97 sopt = &(IPCB(skb)->opt);
97 98
98 if (sopt->optlen == 0) { 99 if (sopt->optlen == 0)
99 dopt->optlen = 0;
100 return 0; 100 return 0;
101 }
102 101
103 sptr = skb_network_header(skb); 102 sptr = skb_network_header(skb);
104 dptr = dopt->__data; 103 dptr = dopt->__data;
@@ -140,11 +139,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
140 } else { 139 } else {
141 dopt->ts_needtime = 0; 140 dopt->ts_needtime = 0;
142 141
143 if (soffset + 8 <= optlen) { 142 if (soffset + 7 <= optlen) {
144 __be32 addr; 143 __be32 addr;
145 144
146 memcpy(&addr, sptr+soffset-1, 4); 145 memcpy(&addr, dptr+soffset-1, 4);
147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { 146 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
148 dopt->ts_needtime = 1; 147 dopt->ts_needtime = 1;
149 soffset += 8; 148 soffset += 8;
150 } 149 }
@@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
157 dopt->optlen += optlen; 156 dopt->optlen += optlen;
158 } 157 }
159 if (sopt->srr) { 158 if (sopt->srr) {
160 unsigned char * start = sptr+sopt->srr; 159 unsigned char *start = sptr+sopt->srr;
161 __be32 faddr; 160 __be32 faddr;
162 161
163 optlen = start[1]; 162 optlen = start[1];
@@ -329,7 +328,7 @@ int ip_options_compile(struct net *net,
329 pp_ptr = optptr + 2; 328 pp_ptr = optptr + 2;
330 goto error; 329 goto error;
331 } 330 }
332 if (skb) { 331 if (rt) {
333 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 332 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
334 opt->is_changed = 1; 333 opt->is_changed = 1;
335 } 334 }
@@ -352,7 +351,7 @@ int ip_options_compile(struct net *net,
352 goto error; 351 goto error;
353 } 352 }
354 if (optptr[2] <= optlen) { 353 if (optptr[2] <= optlen) {
355 __be32 *timeptr = NULL; 354 unsigned char *timeptr = NULL;
356 if (optptr[2]+3 > optptr[1]) { 355 if (optptr[2]+3 > optptr[1]) {
357 pp_ptr = optptr + 2; 356 pp_ptr = optptr + 2;
358 goto error; 357 goto error;
@@ -361,7 +360,7 @@ int ip_options_compile(struct net *net,
361 case IPOPT_TS_TSONLY: 360 case IPOPT_TS_TSONLY:
362 opt->ts = optptr - iph; 361 opt->ts = optptr - iph;
363 if (skb) 362 if (skb)
364 timeptr = (__be32*)&optptr[optptr[2]-1]; 363 timeptr = &optptr[optptr[2]-1];
365 opt->ts_needtime = 1; 364 opt->ts_needtime = 1;
366 optptr[2] += 4; 365 optptr[2] += 4;
367 break; 366 break;
@@ -371,9 +370,9 @@ int ip_options_compile(struct net *net,
371 goto error; 370 goto error;
372 } 371 }
373 opt->ts = optptr - iph; 372 opt->ts = optptr - iph;
374 if (skb) { 373 if (rt) {
375 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 374 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
376 timeptr = (__be32*)&optptr[optptr[2]+3]; 375 timeptr = &optptr[optptr[2]+3];
377 } 376 }
378 opt->ts_needaddr = 1; 377 opt->ts_needaddr = 1;
379 opt->ts_needtime = 1; 378 opt->ts_needtime = 1;
@@ -391,7 +390,7 @@ int ip_options_compile(struct net *net,
391 if (inet_addr_type(net, addr) == RTN_UNICAST) 390 if (inet_addr_type(net, addr) == RTN_UNICAST)
392 break; 391 break;
393 if (skb) 392 if (skb)
394 timeptr = (__be32*)&optptr[optptr[2]+3]; 393 timeptr = &optptr[optptr[2]+3];
395 } 394 }
396 opt->ts_needtime = 1; 395 opt->ts_needtime = 1;
397 optptr[2] += 8; 396 optptr[2] += 8;
@@ -405,10 +404,10 @@ int ip_options_compile(struct net *net,
405 } 404 }
406 if (timeptr) { 405 if (timeptr) {
407 struct timespec tv; 406 struct timespec tv;
408 __be32 midtime; 407 u32 midtime;
409 getnstimeofday(&tv); 408 getnstimeofday(&tv);
410 midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); 409 midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
411 memcpy(timeptr, &midtime, sizeof(__be32)); 410 put_unaligned_be32(midtime, timeptr);
412 opt->is_changed = 1; 411 opt->is_changed = 1;
413 } 412 }
414 } else { 413 } else {
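
The timestamp hunks above switch timeptr from a __be32 pointer to a plain unsigned char pointer and store the value with put_unaligned_be32() instead of htonl()+memcpy(). The reason is that the timestamp slot inside the IP option area is not guaranteed to be 4-byte aligned, so a cast-and-dereference store is unsafe on strict-alignment architectures; put_unaligned_be32() also does the byte-order conversion, which is why midtime becomes a host-order u32. A minimal illustration of the helper, not taken from the patch (the function name is invented):

#include <asm/unaligned.h>

/* Store a 32-bit value in network byte order at an arbitrarily
 * aligned slot inside an option buffer.  Illustrative only. */
static void example_store_option_ts(unsigned char *slot, u32 msecs_since_midnight)
{
	put_unaligned_be32(msecs_since_midnight, slot);
}
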
@@ -466,7 +465,7 @@ error:
466 } 465 }
467 return -EINVAL; 466 return -EINVAL;
468} 467}
469 468EXPORT_SYMBOL(ip_options_compile);
470 469
471/* 470/*
472 * Undo all the changes done by ip_options_compile(). 471 * Undo all the changes done by ip_options_compile().
@@ -499,19 +498,19 @@ void ip_options_undo(struct ip_options * opt)
499 } 498 }
500} 499}
501 500
502static struct ip_options *ip_options_get_alloc(const int optlen) 501static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
503{ 502{
504 return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), 503 return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
505 GFP_KERNEL); 504 GFP_KERNEL);
506} 505}
507 506
508static int ip_options_get_finish(struct net *net, struct ip_options **optp, 507static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
509 struct ip_options *opt, int optlen) 508 struct ip_options_rcu *opt, int optlen)
510{ 509{
511 while (optlen & 3) 510 while (optlen & 3)
512 opt->__data[optlen++] = IPOPT_END; 511 opt->opt.__data[optlen++] = IPOPT_END;
513 opt->optlen = optlen; 512 opt->opt.optlen = optlen;
514 if (optlen && ip_options_compile(net, opt, NULL)) { 513 if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
515 kfree(opt); 514 kfree(opt);
516 return -EINVAL; 515 return -EINVAL;
517 } 516 }
@@ -520,29 +519,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp,
520 return 0; 519 return 0;
521} 520}
522 521
523int ip_options_get_from_user(struct net *net, struct ip_options **optp, 522int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
524 unsigned char __user *data, int optlen) 523 unsigned char __user *data, int optlen)
525{ 524{
526 struct ip_options *opt = ip_options_get_alloc(optlen); 525 struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
527 526
528 if (!opt) 527 if (!opt)
529 return -ENOMEM; 528 return -ENOMEM;
530 if (optlen && copy_from_user(opt->__data, data, optlen)) { 529 if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
531 kfree(opt); 530 kfree(opt);
532 return -EFAULT; 531 return -EFAULT;
533 } 532 }
534 return ip_options_get_finish(net, optp, opt, optlen); 533 return ip_options_get_finish(net, optp, opt, optlen);
535} 534}
536 535
537int ip_options_get(struct net *net, struct ip_options **optp, 536int ip_options_get(struct net *net, struct ip_options_rcu **optp,
538 unsigned char *data, int optlen) 537 unsigned char *data, int optlen)
539{ 538{
540 struct ip_options *opt = ip_options_get_alloc(optlen); 539 struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
541 540
542 if (!opt) 541 if (!opt)
543 return -ENOMEM; 542 return -ENOMEM;
544 if (optlen) 543 if (optlen)
545 memcpy(opt->__data, data, optlen); 544 memcpy(opt->opt.__data, data, optlen);
546 return ip_options_get_finish(net, optp, opt, optlen); 545 return ip_options_get_finish(net, optp, opt, optlen);
547} 546}
548 547
@@ -555,7 +554,7 @@ void ip_forward_options(struct sk_buff *skb)
555 554
556 if (opt->rr_needaddr) { 555 if (opt->rr_needaddr) {
557 optptr = (unsigned char *)raw + opt->rr; 556 optptr = (unsigned char *)raw + opt->rr;
558 ip_rt_get_source(&optptr[optptr[2]-5], rt); 557 ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
559 opt->is_changed = 1; 558 opt->is_changed = 1;
560 } 559 }
561 if (opt->srr_is_hit) { 560 if (opt->srr_is_hit) {
@@ -569,19 +568,18 @@ void ip_forward_options(struct sk_buff *skb)
569 ) { 568 ) {
570 if (srrptr + 3 > srrspace) 569 if (srrptr + 3 > srrspace)
571 break; 570 break;
572 if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) 571 if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0)
573 break; 572 break;
574 } 573 }
575 if (srrptr + 3 <= srrspace) { 574 if (srrptr + 3 <= srrspace) {
576 opt->is_changed = 1; 575 opt->is_changed = 1;
577 ip_rt_get_source(&optptr[srrptr-1], rt); 576 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
578 ip_hdr(skb)->daddr = rt->rt_dst;
579 optptr[2] = srrptr+4; 577 optptr[2] = srrptr+4;
580 } else if (net_ratelimit()) 578 } else if (net_ratelimit())
581 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); 579 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
582 if (opt->ts_needaddr) { 580 if (opt->ts_needaddr) {
583 optptr = raw + opt->ts; 581 optptr = raw + opt->ts;
584 ip_rt_get_source(&optptr[optptr[2]-9], rt); 582 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
585 opt->is_changed = 1; 583 opt->is_changed = 1;
586 } 584 }
587 } 585 }
@@ -603,7 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
603 unsigned long orefdst; 601 unsigned long orefdst;
604 int err; 602 int err;
605 603
606 if (!opt->srr) 604 if (!rt)
607 return 0; 605 return 0;
608 606
609 if (skb->pkt_type != PACKET_HOST) 607 if (skb->pkt_type != PACKET_HOST)
@@ -637,7 +635,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
637 if (rt2->rt_type != RTN_LOCAL) 635 if (rt2->rt_type != RTN_LOCAL)
638 break; 636 break;
639 /* Superfast 8) loopback forward */ 637 /* Superfast 8) loopback forward */
640 memcpy(&iph->daddr, &optptr[srrptr-1], 4); 638 iph->daddr = nexthop;
641 opt->is_changed = 1; 639 opt->is_changed = 1;
642 } 640 }
643 if (srrptr <= srrspace) { 641 if (srrptr <= srrspace) {
@@ -646,3 +644,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
646 } 644 }
647 return 0; 645 return 0;
648} 646}
647EXPORT_SYMBOL(ip_options_rcv_srr);
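
Throughout this file the option helpers now traffic in struct ip_options_rcu, which embeds the old struct ip_options as its ->opt member next to an rcu_head, so a socket's option block can be replaced without blocking readers. A rough sketch of a caller using the new ip_options_get() signature; the function name and the optdata buffer are invented for illustration, and a real caller would also release any previously installed block via call_rcu(), as do_ip_setsockopt() does later in this patch:

#include <net/ip.h>
#include <net/inet_sock.h>

static int example_install_options(struct net *net, struct sock *sk,
				   unsigned char *optdata, int optlen)
{
	struct ip_options_rcu *opt = NULL;
	int err;

	/* Copies optdata, pads it to a 4-byte boundary and compiles it. */
	err = ip_options_get(net, &opt, optdata, optlen);
	if (err)
		return err;

	/* Publish for RCU readers on the transmit path. */
	rcu_assign_pointer(inet_sk(sk)->inet_opt, opt);
	return 0;
}
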
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7649d7750075..84f26e8e6c60 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
82#include <linux/tcp.h> 82#include <linux/tcp.h>
83 83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 86
86/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
131 132
132 if (ttl < 0) 133 if (ttl < 0)
133 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
134 return ttl; 135 return ttl;
135} 136}
136 137
@@ -139,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
139 * 140 *
140 */ 141 */
141int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
142 __be32 saddr, __be32 daddr, struct ip_options *opt) 143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
143{ 144{
144 struct inet_sock *inet = inet_sk(sk); 145 struct inet_sock *inet = inet_sk(sk);
145 struct rtable *rt = skb_rtable(skb); 146 struct rtable *rt = skb_rtable(skb);
146 struct iphdr *iph; 147 struct iphdr *iph;
147 148
148 /* Build the IP header. */ 149 /* Build the IP header. */
149 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
150 skb_reset_network_header(skb); 151 skb_reset_network_header(skb);
151 iph = ip_hdr(skb); 152 iph = ip_hdr(skb);
152 iph->version = 4; 153 iph->version = 4;
@@ -157,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
157 else 158 else
158 iph->frag_off = 0; 159 iph->frag_off = 0;
159 iph->ttl = ip_select_ttl(inet, &rt->dst); 160 iph->ttl = ip_select_ttl(inet, &rt->dst);
160 iph->daddr = rt->rt_dst; 161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
161 iph->saddr = rt->rt_src; 162 iph->saddr = saddr;
162 iph->protocol = sk->sk_protocol; 163 iph->protocol = sk->sk_protocol;
163 ip_select_ident(iph, &rt->dst, sk); 164 ip_select_ident(iph, &rt->dst, sk);
164 165
165 if (opt && opt->optlen) { 166 if (opt && opt->opt.optlen) {
166 iph->ihl += opt->optlen>>2; 167 iph->ihl += opt->opt.optlen>>2;
167 ip_options_build(skb, opt, daddr, rt, 0); 168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
168 } 169 }
169 170
170 skb->priority = sk->sk_priority; 171 skb->priority = sk->sk_priority;
@@ -311,11 +312,12 @@ int ip_output(struct sk_buff *skb)
311 !(IPCB(skb)->flags & IPSKB_REROUTED)); 312 !(IPCB(skb)->flags & IPSKB_REROUTED));
312} 313}
313 314
314int ip_queue_xmit(struct sk_buff *skb) 315int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
315{ 316{
316 struct sock *sk = skb->sk; 317 struct sock *sk = skb->sk;
317 struct inet_sock *inet = inet_sk(sk); 318 struct inet_sock *inet = inet_sk(sk);
318 struct ip_options *opt = inet->opt; 319 struct ip_options_rcu *inet_opt;
320 struct flowi4 *fl4;
319 struct rtable *rt; 321 struct rtable *rt;
320 struct iphdr *iph; 322 struct iphdr *iph;
321 int res; 323 int res;
@@ -324,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb)
324 * f.e. by something like SCTP. 326 * f.e. by something like SCTP.
325 */ 327 */
326 rcu_read_lock(); 328 rcu_read_lock();
329 inet_opt = rcu_dereference(inet->inet_opt);
330 fl4 = &fl->u.ip4;
327 rt = skb_rtable(skb); 331 rt = skb_rtable(skb);
328 if (rt != NULL) 332 if (rt != NULL)
329 goto packet_routed; 333 goto packet_routed;
@@ -335,40 +339,32 @@ int ip_queue_xmit(struct sk_buff *skb)
335 339
336 /* Use correct destination address if we have options. */ 340 /* Use correct destination address if we have options. */
337 daddr = inet->inet_daddr; 341 daddr = inet->inet_daddr;
338 if(opt && opt->srr) 342 if (inet_opt && inet_opt->opt.srr)
339 daddr = opt->faddr; 343 daddr = inet_opt->opt.faddr;
340 344
341 { 345 /* If this fails, retransmit mechanism of transport layer will
342 struct flowi fl = { .oif = sk->sk_bound_dev_if, 346 * keep trying until route appears or the connection times
343 .mark = sk->sk_mark, 347 * itself out.
344 .nl_u = { .ip4_u = 348 */
345 { .daddr = daddr, 349 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
346 .saddr = inet->inet_saddr, 350 daddr, inet->inet_saddr,
347 .tos = RT_CONN_FLAGS(sk) } }, 351 inet->inet_dport,
348 .proto = sk->sk_protocol, 352 inet->inet_sport,
349 .flags = inet_sk_flowi_flags(sk), 353 sk->sk_protocol,
350 .uli_u = { .ports = 354 RT_CONN_FLAGS(sk),
351 { .sport = inet->inet_sport, 355 sk->sk_bound_dev_if);
352 .dport = inet->inet_dport } } }; 356 if (IS_ERR(rt))
353 357 goto no_route;
354 /* If this fails, retransmit mechanism of transport layer will
355 * keep trying until route appears or the connection times
356 * itself out.
357 */
358 security_sk_classify_flow(sk, &fl);
359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
360 goto no_route;
361 }
362 sk_setup_caps(sk, &rt->dst); 358 sk_setup_caps(sk, &rt->dst);
363 } 359 }
364 skb_dst_set_noref(skb, &rt->dst); 360 skb_dst_set_noref(skb, &rt->dst);
365 361
366packet_routed: 362packet_routed:
367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 363 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
368 goto no_route; 364 goto no_route;
369 365
370 /* OK, we know where to send it, allocate and build IP header. */ 366 /* OK, we know where to send it, allocate and build IP header. */
371 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 367 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
372 skb_reset_network_header(skb); 368 skb_reset_network_header(skb);
373 iph = ip_hdr(skb); 369 iph = ip_hdr(skb);
374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 370 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
@@ -378,13 +374,13 @@ packet_routed:
378 iph->frag_off = 0; 374 iph->frag_off = 0;
379 iph->ttl = ip_select_ttl(inet, &rt->dst); 375 iph->ttl = ip_select_ttl(inet, &rt->dst);
380 iph->protocol = sk->sk_protocol; 376 iph->protocol = sk->sk_protocol;
381 iph->saddr = rt->rt_src; 377 iph->saddr = fl4->saddr;
382 iph->daddr = rt->rt_dst; 378 iph->daddr = fl4->daddr;
383 /* Transport layer set skb->h.foo itself. */ 379 /* Transport layer set skb->h.foo itself. */
384 380
385 if (opt && opt->optlen) { 381 if (inet_opt && inet_opt->opt.optlen) {
386 iph->ihl += opt->optlen >> 2; 382 iph->ihl += inet_opt->opt.optlen >> 2;
387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0); 383 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
388 } 384 }
389 385
390 ip_select_ident_more(iph, &rt->dst, sk, 386 ip_select_ident_more(iph, &rt->dst, sk,
@@ -487,7 +483,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 483 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 484 * we can switch to copy when see the first bad fragment.
489 */ 485 */
490 if (skb_has_frags(skb)) { 486 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag, *frag2; 487 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 488 int first_len = skb_pagelen(skb);
493 489
@@ -610,7 +606,7 @@ slow_path:
610 /* IF: it doesn't fit, use 'mtu' - the data space left */ 606 /* IF: it doesn't fit, use 'mtu' - the data space left */
611 if (len > mtu) 607 if (len > mtu)
612 len = mtu; 608 len = mtu;
613 /* IF: we are not sending upto and including the packet end 609 /* IF: we are not sending up to and including the packet end
614 then align the next start on an eight byte boundary */ 610 then align the next start on an eight byte boundary */
615 if (len < left) { 611 if (len < left) {
616 len &= ~7; 612 len &= ~7;
@@ -734,6 +730,7 @@ csum_page(struct page *page, int offset, int copy)
734} 730}
735 731
736static inline int ip_ufo_append_data(struct sock *sk, 732static inline int ip_ufo_append_data(struct sock *sk,
733 struct sk_buff_head *queue,
737 int getfrag(void *from, char *to, int offset, int len, 734 int getfrag(void *from, char *to, int offset, int len,
738 int odd, struct sk_buff *skb), 735 int odd, struct sk_buff *skb),
739 void *from, int length, int hh_len, int fragheaderlen, 736 void *from, int length, int hh_len, int fragheaderlen,
@@ -746,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
746 * device, so create one single skb packet containing complete 743 * device, so create one single skb packet containing complete
747 * udp datagram 744 * udp datagram
748 */ 745 */
749 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 746 if ((skb = skb_peek_tail(queue)) == NULL) {
750 skb = sock_alloc_send_skb(sk, 747 skb = sock_alloc_send_skb(sk,
751 hh_len + fragheaderlen + transhdrlen + 20, 748 hh_len + fragheaderlen + transhdrlen + 20,
752 (flags & MSG_DONTWAIT), &err); 749 (flags & MSG_DONTWAIT), &err);
@@ -768,40 +765,30 @@ static inline int ip_ufo_append_data(struct sock *sk,
768 765
769 skb->ip_summed = CHECKSUM_PARTIAL; 766 skb->ip_summed = CHECKSUM_PARTIAL;
770 skb->csum = 0; 767 skb->csum = 0;
771 sk->sk_sndmsg_off = 0;
772 768
773 /* specify the length of each IP datagram fragment */ 769 /* specify the length of each IP datagram fragment */
774 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 770 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
775 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 771 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
776 __skb_queue_tail(&sk->sk_write_queue, skb); 772 __skb_queue_tail(queue, skb);
777 } 773 }
778 774
779 return skb_append_datato_frags(sk, skb, getfrag, from, 775 return skb_append_datato_frags(sk, skb, getfrag, from,
780 (length - transhdrlen)); 776 (length - transhdrlen));
781} 777}
782 778
783/* 779static int __ip_append_data(struct sock *sk,
784 * ip_append_data() and ip_append_page() can make one large IP datagram 780 struct flowi4 *fl4,
785 * from many pieces of data. Each pieces will be holded on the socket 781 struct sk_buff_head *queue,
786 * until ip_push_pending_frames() is called. Each piece can be a page 782 struct inet_cork *cork,
787 * or non-page data. 783 int getfrag(void *from, char *to, int offset,
788 * 784 int len, int odd, struct sk_buff *skb),
789 * Not only UDP, other transport protocols - e.g. raw sockets - can use 785 void *from, int length, int transhdrlen,
790 * this interface potentially. 786 unsigned int flags)
791 *
792 * LATER: length must be adjusted by pad at tail, when it is required.
793 */
794int ip_append_data(struct sock *sk,
795 int getfrag(void *from, char *to, int offset, int len,
796 int odd, struct sk_buff *skb),
797 void *from, int length, int transhdrlen,
798 struct ipcm_cookie *ipc, struct rtable **rtp,
799 unsigned int flags)
800{ 787{
801 struct inet_sock *inet = inet_sk(sk); 788 struct inet_sock *inet = inet_sk(sk);
802 struct sk_buff *skb; 789 struct sk_buff *skb;
803 790
804 struct ip_options *opt = NULL; 791 struct ip_options *opt = cork->opt;
805 int hh_len; 792 int hh_len;
806 int exthdrlen; 793 int exthdrlen;
807 int mtu; 794 int mtu;
@@ -810,60 +797,20 @@ int ip_append_data(struct sock *sk,
810 int offset = 0; 797 int offset = 0;
811 unsigned int maxfraglen, fragheaderlen; 798 unsigned int maxfraglen, fragheaderlen;
812 int csummode = CHECKSUM_NONE; 799 int csummode = CHECKSUM_NONE;
813 struct rtable *rt; 800 struct rtable *rt = (struct rtable *)cork->dst;
814 801
815 if (flags&MSG_PROBE) 802 skb = skb_peek_tail(queue);
816 return 0;
817 803
818 if (skb_queue_empty(&sk->sk_write_queue)) { 804 exthdrlen = !skb ? rt->dst.header_len : 0;
819 /* 805 mtu = cork->fragsize;
820 * setup for corking.
821 */
822 opt = ipc->opt;
823 if (opt) {
824 if (inet->cork.opt == NULL) {
825 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
826 if (unlikely(inet->cork.opt == NULL))
827 return -ENOBUFS;
828 }
829 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
830 inet->cork.flags |= IPCORK_OPT;
831 inet->cork.addr = ipc->addr;
832 }
833 rt = *rtp;
834 if (unlikely(!rt))
835 return -EFAULT;
836 /*
837 * We steal reference to this route, caller should not release it
838 */
839 *rtp = NULL;
840 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
841 rt->dst.dev->mtu :
842 dst_mtu(rt->dst.path);
843 inet->cork.dst = &rt->dst;
844 inet->cork.length = 0;
845 sk->sk_sndmsg_page = NULL;
846 sk->sk_sndmsg_off = 0;
847 if ((exthdrlen = rt->dst.header_len) != 0) {
848 length += exthdrlen;
849 transhdrlen += exthdrlen;
850 }
851 } else {
852 rt = (struct rtable *)inet->cork.dst;
853 if (inet->cork.flags & IPCORK_OPT)
854 opt = inet->cork.opt;
855 806
856 transhdrlen = 0;
857 exthdrlen = 0;
858 mtu = inet->cork.fragsize;
859 }
860 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 807 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
861 808
862 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 809 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
863 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 810 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
864 811
865 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 812 if (cork->length + length > 0xFFFF - fragheaderlen) {
866 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 813 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
867 mtu-exthdrlen); 814 mtu-exthdrlen);
868 return -EMSGSIZE; 815 return -EMSGSIZE;
869 } 816 }
@@ -878,15 +825,13 @@ int ip_append_data(struct sock *sk,
878 !exthdrlen) 825 !exthdrlen)
879 csummode = CHECKSUM_PARTIAL; 826 csummode = CHECKSUM_PARTIAL;
880 827
881 skb = skb_peek_tail(&sk->sk_write_queue); 828 cork->length += length;
882
883 inet->cork.length += length;
884 if (((length > mtu) || (skb && skb_is_gso(skb))) && 829 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
885 (sk->sk_protocol == IPPROTO_UDP) && 830 (sk->sk_protocol == IPPROTO_UDP) &&
886 (rt->dst.dev->features & NETIF_F_UFO)) { 831 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
887 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 832 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
888 fragheaderlen, transhdrlen, mtu, 833 hh_len, fragheaderlen, transhdrlen,
889 flags); 834 mtu, flags);
890 if (err) 835 if (err)
891 goto error; 836 goto error;
892 return 0; 837 return 0;
@@ -934,7 +879,9 @@ alloc_new_skb:
934 !(rt->dst.dev->features&NETIF_F_SG)) 879 !(rt->dst.dev->features&NETIF_F_SG))
935 alloclen = mtu; 880 alloclen = mtu;
936 else 881 else
937 alloclen = datalen + fragheaderlen; 882 alloclen = fraglen;
883
884 alloclen += exthdrlen;
938 885
939 /* The last fragment gets additional space at tail. 886 /* The last fragment gets additional space at tail.
940 * Note, with MSG_MORE we overallocate on fragments, 887 * Note, with MSG_MORE we overallocate on fragments,
@@ -960,7 +907,7 @@ alloc_new_skb:
960 else 907 else
961 /* only the initial fragment is 908 /* only the initial fragment is
962 time stamped */ 909 time stamped */
963 ipc->shtx.flags = 0; 910 cork->tx_flags = 0;
964 } 911 }
965 if (skb == NULL) 912 if (skb == NULL)
966 goto error; 913 goto error;
@@ -971,16 +918,16 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 918 skb->ip_summed = csummode;
972 skb->csum = 0; 919 skb->csum = 0;
973 skb_reserve(skb, hh_len); 920 skb_reserve(skb, hh_len);
974 *skb_tx(skb) = ipc->shtx; 921 skb_shinfo(skb)->tx_flags = cork->tx_flags;
975 922
976 /* 923 /*
977 * Find where to start putting bytes. 924 * Find where to start putting bytes.
978 */ 925 */
979 data = skb_put(skb, fraglen); 926 data = skb_put(skb, fraglen + exthdrlen);
980 skb_set_network_header(skb, exthdrlen); 927 skb_set_network_header(skb, exthdrlen);
981 skb->transport_header = (skb->network_header + 928 skb->transport_header = (skb->network_header +
982 fragheaderlen); 929 fragheaderlen);
983 data += fragheaderlen; 930 data += fragheaderlen + exthdrlen;
984 931
985 if (fraggap) { 932 if (fraggap) {
986 skb->csum = skb_copy_and_csum_bits( 933 skb->csum = skb_copy_and_csum_bits(
@@ -1008,7 +955,7 @@ alloc_new_skb:
1008 /* 955 /*
1009 * Put the packet on the pending queue. 956 * Put the packet on the pending queue.
1010 */ 957 */
1011 __skb_queue_tail(&sk->sk_write_queue, skb); 958 __skb_queue_tail(queue, skb);
1012 continue; 959 continue;
1013 } 960 }
1014 961
@@ -1028,8 +975,8 @@ alloc_new_skb:
1028 } else { 975 } else {
1029 int i = skb_shinfo(skb)->nr_frags; 976 int i = skb_shinfo(skb)->nr_frags;
1030 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 977 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1031 struct page *page = sk->sk_sndmsg_page; 978 struct page *page = cork->page;
1032 int off = sk->sk_sndmsg_off; 979 int off = cork->off;
1033 unsigned int left; 980 unsigned int left;
1034 981
1035 if (page && (left = PAGE_SIZE - off) > 0) { 982 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1041,7 +988,7 @@ alloc_new_skb:
1041 goto error; 988 goto error;
1042 } 989 }
1043 get_page(page); 990 get_page(page);
1044 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 991 skb_fill_page_desc(skb, i, page, off, 0);
1045 frag = &skb_shinfo(skb)->frags[i]; 992 frag = &skb_shinfo(skb)->frags[i];
1046 } 993 }
1047 } else if (i < MAX_SKB_FRAGS) { 994 } else if (i < MAX_SKB_FRAGS) {
@@ -1052,8 +999,8 @@ alloc_new_skb:
1052 err = -ENOMEM; 999 err = -ENOMEM;
1053 goto error; 1000 goto error;
1054 } 1001 }
1055 sk->sk_sndmsg_page = page; 1002 cork->page = page;
1056 sk->sk_sndmsg_off = 0; 1003 cork->off = 0;
1057 1004
1058 skb_fill_page_desc(skb, i, page, 0, 0); 1005 skb_fill_page_desc(skb, i, page, 0, 0);
1059 frag = &skb_shinfo(skb)->frags[i]; 1006 frag = &skb_shinfo(skb)->frags[i];
@@ -1065,7 +1012,7 @@ alloc_new_skb:
1065 err = -EFAULT; 1012 err = -EFAULT;
1066 goto error; 1013 goto error;
1067 } 1014 }
1068 sk->sk_sndmsg_off += copy; 1015 cork->off += copy;
1069 frag->size += copy; 1016 frag->size += copy;
1070 skb->len += copy; 1017 skb->len += copy;
1071 skb->data_len += copy; 1018 skb->data_len += copy;
@@ -1079,18 +1026,95 @@ alloc_new_skb:
1079 return 0; 1026 return 0;
1080 1027
1081error: 1028error:
1082 inet->cork.length -= length; 1029 cork->length -= length;
1083 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1084 return err; 1031 return err;
1085} 1032}
1086 1033
1087ssize_t ip_append_page(struct sock *sk, struct page *page, 1034static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035 struct ipcm_cookie *ipc, struct rtable **rtp)
1036{
1037 struct inet_sock *inet = inet_sk(sk);
1038 struct ip_options_rcu *opt;
1039 struct rtable *rt;
1040
1041 /*
1042 * setup for corking.
1043 */
1044 opt = ipc->opt;
1045 if (opt) {
1046 if (cork->opt == NULL) {
1047 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048 sk->sk_allocation);
1049 if (unlikely(cork->opt == NULL))
1050 return -ENOBUFS;
1051 }
1052 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053 cork->flags |= IPCORK_OPT;
1054 cork->addr = ipc->addr;
1055 }
1056 rt = *rtp;
1057 if (unlikely(!rt))
1058 return -EFAULT;
1059 /*
1060 * We steal reference to this route, caller should not release it
1061 */
1062 *rtp = NULL;
1063 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065 cork->dst = &rt->dst;
1066 cork->length = 0;
1067 cork->tx_flags = ipc->tx_flags;
1068 cork->page = NULL;
1069 cork->off = 0;
1070
1071 return 0;
1072}
1073
1074/*
1075 * ip_append_data() and ip_append_page() can make one large IP datagram
1076 * from many pieces of data. Each pieces will be holded on the socket
1077 * until ip_push_pending_frames() is called. Each piece can be a page
1078 * or non-page data.
1079 *
1080 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1081 * this interface potentially.
1082 *
1083 * LATER: length must be adjusted by pad at tail, when it is required.
1084 */
1085int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086 int getfrag(void *from, char *to, int offset, int len,
1087 int odd, struct sk_buff *skb),
1088 void *from, int length, int transhdrlen,
1089 struct ipcm_cookie *ipc, struct rtable **rtp,
1090 unsigned int flags)
1091{
1092 struct inet_sock *inet = inet_sk(sk);
1093 int err;
1094
1095 if (flags&MSG_PROBE)
1096 return 0;
1097
1098 if (skb_queue_empty(&sk->sk_write_queue)) {
1099 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100 if (err)
1101 return err;
1102 } else {
1103 transhdrlen = 0;
1104 }
1105
1106 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107 from, length, transhdrlen, flags);
1108}
1109
1110ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1088 int offset, size_t size, int flags) 1111 int offset, size_t size, int flags)
1089{ 1112{
1090 struct inet_sock *inet = inet_sk(sk); 1113 struct inet_sock *inet = inet_sk(sk);
1091 struct sk_buff *skb; 1114 struct sk_buff *skb;
1092 struct rtable *rt; 1115 struct rtable *rt;
1093 struct ip_options *opt = NULL; 1116 struct ip_options *opt = NULL;
1117 struct inet_cork *cork;
1094 int hh_len; 1118 int hh_len;
1095 int mtu; 1119 int mtu;
1096 int len; 1120 int len;
@@ -1106,28 +1130,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1106 if (skb_queue_empty(&sk->sk_write_queue)) 1130 if (skb_queue_empty(&sk->sk_write_queue))
1107 return -EINVAL; 1131 return -EINVAL;
1108 1132
1109 rt = (struct rtable *)inet->cork.dst; 1133 cork = &inet->cork.base;
1110 if (inet->cork.flags & IPCORK_OPT) 1134 rt = (struct rtable *)cork->dst;
1111 opt = inet->cork.opt; 1135 if (cork->flags & IPCORK_OPT)
1136 opt = cork->opt;
1112 1137
1113 if (!(rt->dst.dev->features&NETIF_F_SG)) 1138 if (!(rt->dst.dev->features&NETIF_F_SG))
1114 return -EOPNOTSUPP; 1139 return -EOPNOTSUPP;
1115 1140
1116 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1141 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1117 mtu = inet->cork.fragsize; 1142 mtu = cork->fragsize;
1118 1143
1119 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1144 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1120 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1145 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1121 1146
1122 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1147 if (cork->length + size > 0xFFFF - fragheaderlen) {
1123 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); 1148 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1124 return -EMSGSIZE; 1149 return -EMSGSIZE;
1125 } 1150 }
1126 1151
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1152 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1128 return -EINVAL; 1153 return -EINVAL;
1129 1154
1130 inet->cork.length += size; 1155 cork->length += size;
1131 if ((size + skb->len > mtu) && 1156 if ((size + skb->len > mtu) &&
1132 (sk->sk_protocol == IPPROTO_UDP) && 1157 (sk->sk_protocol == IPPROTO_UDP) &&
1133 (rt->dst.dev->features & NETIF_F_UFO)) { 1158 (rt->dst.dev->features & NETIF_F_UFO)) {
@@ -1222,45 +1247,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1222 return 0; 1247 return 0;
1223 1248
1224error: 1249error:
1225 inet->cork.length -= size; 1250 cork->length -= size;
1226 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1251 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1227 return err; 1252 return err;
1228} 1253}
1229 1254
1230static void ip_cork_release(struct inet_sock *inet) 1255static void ip_cork_release(struct inet_cork *cork)
1231{ 1256{
1232 inet->cork.flags &= ~IPCORK_OPT; 1257 cork->flags &= ~IPCORK_OPT;
1233 kfree(inet->cork.opt); 1258 kfree(cork->opt);
1234 inet->cork.opt = NULL; 1259 cork->opt = NULL;
1235 dst_release(inet->cork.dst); 1260 dst_release(cork->dst);
1236 inet->cork.dst = NULL; 1261 cork->dst = NULL;
1237} 1262}
1238 1263
1239/* 1264/*
1240 * Combined all pending IP fragments on the socket as one IP datagram 1265 * Combined all pending IP fragments on the socket as one IP datagram
1241 * and push them out. 1266 * and push them out.
1242 */ 1267 */
1243int ip_push_pending_frames(struct sock *sk) 1268struct sk_buff *__ip_make_skb(struct sock *sk,
1269 struct flowi4 *fl4,
1270 struct sk_buff_head *queue,
1271 struct inet_cork *cork)
1244{ 1272{
1245 struct sk_buff *skb, *tmp_skb; 1273 struct sk_buff *skb, *tmp_skb;
1246 struct sk_buff **tail_skb; 1274 struct sk_buff **tail_skb;
1247 struct inet_sock *inet = inet_sk(sk); 1275 struct inet_sock *inet = inet_sk(sk);
1248 struct net *net = sock_net(sk); 1276 struct net *net = sock_net(sk);
1249 struct ip_options *opt = NULL; 1277 struct ip_options *opt = NULL;
1250 struct rtable *rt = (struct rtable *)inet->cork.dst; 1278 struct rtable *rt = (struct rtable *)cork->dst;
1251 struct iphdr *iph; 1279 struct iphdr *iph;
1252 __be16 df = 0; 1280 __be16 df = 0;
1253 __u8 ttl; 1281 __u8 ttl;
1254 int err = 0;
1255 1282
1256 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1283 if ((skb = __skb_dequeue(queue)) == NULL)
1257 goto out; 1284 goto out;
1258 tail_skb = &(skb_shinfo(skb)->frag_list); 1285 tail_skb = &(skb_shinfo(skb)->frag_list);
1259 1286
1260 /* move skb->data to ip header from ext header */ 1287 /* move skb->data to ip header from ext header */
1261 if (skb->data < skb_network_header(skb)) 1288 if (skb->data < skb_network_header(skb))
1262 __skb_pull(skb, skb_network_offset(skb)); 1289 __skb_pull(skb, skb_network_offset(skb));
1263 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1290 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1264 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1291 __skb_pull(tmp_skb, skb_network_header_len(skb));
1265 *tail_skb = tmp_skb; 1292 *tail_skb = tmp_skb;
1266 tail_skb = &(tmp_skb->next); 1293 tail_skb = &(tmp_skb->next);
@@ -1286,8 +1313,8 @@ int ip_push_pending_frames(struct sock *sk)
1286 ip_dont_fragment(sk, &rt->dst))) 1313 ip_dont_fragment(sk, &rt->dst)))
1287 df = htons(IP_DF); 1314 df = htons(IP_DF);
1288 1315
1289 if (inet->cork.flags & IPCORK_OPT) 1316 if (cork->flags & IPCORK_OPT)
1290 opt = inet->cork.opt; 1317 opt = cork->opt;
1291 1318
1292 if (rt->rt_type == RTN_MULTICAST) 1319 if (rt->rt_type == RTN_MULTICAST)
1293 ttl = inet->mc_ttl; 1320 ttl = inet->mc_ttl;
@@ -1297,17 +1324,18 @@ int ip_push_pending_frames(struct sock *sk)
1297 iph = (struct iphdr *)skb->data; 1324 iph = (struct iphdr *)skb->data;
1298 iph->version = 4; 1325 iph->version = 4;
1299 iph->ihl = 5; 1326 iph->ihl = 5;
1300 if (opt) {
1301 iph->ihl += opt->optlen>>2;
1302 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1303 }
1304 iph->tos = inet->tos; 1327 iph->tos = inet->tos;
1305 iph->frag_off = df; 1328 iph->frag_off = df;
1306 ip_select_ident(iph, &rt->dst, sk); 1329 ip_select_ident(iph, &rt->dst, sk);
1307 iph->ttl = ttl; 1330 iph->ttl = ttl;
1308 iph->protocol = sk->sk_protocol; 1331 iph->protocol = sk->sk_protocol;
1309 iph->saddr = rt->rt_src; 1332 iph->saddr = fl4->saddr;
1310 iph->daddr = rt->rt_dst; 1333 iph->daddr = fl4->daddr;
1334
1335 if (opt) {
1336 iph->ihl += opt->optlen>>2;
1337 ip_options_build(skb, opt, cork->addr, rt, 0);
1338 }
1311 1339
1312 skb->priority = sk->sk_priority; 1340 skb->priority = sk->sk_priority;
1313 skb->mark = sk->sk_mark; 1341 skb->mark = sk->sk_mark;
@@ -1315,44 +1343,99 @@ int ip_push_pending_frames(struct sock *sk)
1315 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1343 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1316 * on dst refcount 1344 * on dst refcount
1317 */ 1345 */
1318 inet->cork.dst = NULL; 1346 cork->dst = NULL;
1319 skb_dst_set(skb, &rt->dst); 1347 skb_dst_set(skb, &rt->dst);
1320 1348
1321 if (iph->protocol == IPPROTO_ICMP) 1349 if (iph->protocol == IPPROTO_ICMP)
1322 icmp_out_count(net, ((struct icmphdr *) 1350 icmp_out_count(net, ((struct icmphdr *)
1323 skb_transport_header(skb))->type); 1351 skb_transport_header(skb))->type);
1324 1352
1325 /* Netfilter gets whole the not fragmented skb. */ 1353 ip_cork_release(cork);
1354out:
1355 return skb;
1356}
1357
1358int ip_send_skb(struct sk_buff *skb)
1359{
1360 struct net *net = sock_net(skb->sk);
1361 int err;
1362
1326 err = ip_local_out(skb); 1363 err = ip_local_out(skb);
1327 if (err) { 1364 if (err) {
1328 if (err > 0) 1365 if (err > 0)
1329 err = net_xmit_errno(err); 1366 err = net_xmit_errno(err);
1330 if (err) 1367 if (err)
1331 goto error; 1368 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1332 } 1369 }
1333 1370
1334out:
1335 ip_cork_release(inet);
1336 return err; 1371 return err;
1372}
1337 1373
1338error: 1374int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1339 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1375{
1340 goto out; 1376 struct sk_buff *skb;
1377
1378 skb = ip_finish_skb(sk, fl4);
1379 if (!skb)
1380 return 0;
1381
1382 /* Netfilter gets whole the not fragmented skb. */
1383 return ip_send_skb(skb);
1341} 1384}
1342 1385
1343/* 1386/*
1344 * Throw away all pending data on the socket. 1387 * Throw away all pending data on the socket.
1345 */ 1388 */
1346void ip_flush_pending_frames(struct sock *sk) 1389static void __ip_flush_pending_frames(struct sock *sk,
1390 struct sk_buff_head *queue,
1391 struct inet_cork *cork)
1347{ 1392{
1348 struct sk_buff *skb; 1393 struct sk_buff *skb;
1349 1394
1350 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1395 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1351 kfree_skb(skb); 1396 kfree_skb(skb);
1352 1397
1353 ip_cork_release(inet_sk(sk)); 1398 ip_cork_release(cork);
1354} 1399}
1355 1400
1401void ip_flush_pending_frames(struct sock *sk)
1402{
1403 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1404}
1405
1406struct sk_buff *ip_make_skb(struct sock *sk,
1407 struct flowi4 *fl4,
1408 int getfrag(void *from, char *to, int offset,
1409 int len, int odd, struct sk_buff *skb),
1410 void *from, int length, int transhdrlen,
1411 struct ipcm_cookie *ipc, struct rtable **rtp,
1412 unsigned int flags)
1413{
1414 struct inet_cork cork;
1415 struct sk_buff_head queue;
1416 int err;
1417
1418 if (flags & MSG_PROBE)
1419 return NULL;
1420
1421 __skb_queue_head_init(&queue);
1422
1423 cork.flags = 0;
1424 cork.addr = 0;
1425 cork.opt = NULL;
1426 err = ip_setup_cork(sk, &cork, ipc, rtp);
1427 if (err)
1428 return ERR_PTR(err);
1429
1430 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431 from, length, transhdrlen, flags);
1432 if (err) {
1433 __ip_flush_pending_frames(sk, &queue, &cork);
1434 return ERR_PTR(err);
1435 }
1436
1437 return __ip_make_skb(sk, fl4, &queue, &cork);
1438}
1356 1439
1357/* 1440/*
1358 * Fetch data from kernel space and fill in checksum if needed. 1441 * Fetch data from kernel space and fill in checksum if needed.
@@ -1374,48 +1457,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1374 * Should run single threaded per socket because it uses the sock 1457 * Should run single threaded per socket because it uses the sock
1375 * structure to pass arguments. 1458 * structure to pass arguments.
1376 */ 1459 */
1377void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, 1460void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1378 unsigned int len) 1461 struct ip_reply_arg *arg, unsigned int len)
1379{ 1462{
1380 struct inet_sock *inet = inet_sk(sk); 1463 struct inet_sock *inet = inet_sk(sk);
1381 struct { 1464 struct ip_options_data replyopts;
1382 struct ip_options opt;
1383 char data[40];
1384 } replyopts;
1385 struct ipcm_cookie ipc; 1465 struct ipcm_cookie ipc;
1386 __be32 daddr; 1466 struct flowi4 fl4;
1387 struct rtable *rt = skb_rtable(skb); 1467 struct rtable *rt = skb_rtable(skb);
1388 1468
1389 if (ip_options_echo(&replyopts.opt, skb)) 1469 if (ip_options_echo(&replyopts.opt.opt, skb))
1390 return; 1470 return;
1391 1471
1392 daddr = ipc.addr = rt->rt_src; 1472 ipc.addr = daddr;
1393 ipc.opt = NULL; 1473 ipc.opt = NULL;
1394 ipc.shtx.flags = 0; 1474 ipc.tx_flags = 0;
1395 1475
1396 if (replyopts.opt.optlen) { 1476 if (replyopts.opt.opt.optlen) {
1397 ipc.opt = &replyopts.opt; 1477 ipc.opt = &replyopts.opt;
1398 1478
1399 if (ipc.opt->srr) 1479 if (replyopts.opt.opt.srr)
1400 daddr = replyopts.opt.faddr; 1480 daddr = replyopts.opt.opt.faddr;
1401 } 1481 }
1402 1482
1403 { 1483 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1404 struct flowi fl = { .oif = arg->bound_dev_if, 1484 RT_TOS(ip_hdr(skb)->tos),
1405 .nl_u = { .ip4_u = 1485 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1406 { .daddr = daddr, 1486 ip_reply_arg_flowi_flags(arg),
1407 .saddr = rt->rt_spec_dst, 1487 daddr, rt->rt_spec_dst,
1408 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1488 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1409 /* Not quite clean, but right. */ 1489 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1410 .uli_u = { .ports = 1490 rt = ip_route_output_key(sock_net(sk), &fl4);
1411 { .sport = tcp_hdr(skb)->dest, 1491 if (IS_ERR(rt))
1412 .dport = tcp_hdr(skb)->source } }, 1492 return;
1413 .proto = sk->sk_protocol,
1414 .flags = ip_reply_arg_flowi_flags(arg) };
1415 security_skb_classify_flow(skb, &fl);
1416 if (ip_route_output_key(sock_net(sk), &rt, &fl))
1417 return;
1418 }
1419 1493
1420 /* And let IP do all the hard work. 1494 /* And let IP do all the hard work.
1421 1495
@@ -1428,7 +1502,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1428 sk->sk_priority = skb->priority; 1502 sk->sk_priority = skb->priority;
1429 sk->sk_protocol = ip_hdr(skb)->protocol; 1503 sk->sk_protocol = ip_hdr(skb)->protocol;
1430 sk->sk_bound_dev_if = arg->bound_dev_if; 1504 sk->sk_bound_dev_if = arg->bound_dev_if;
1431 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1505 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1432 &ipc, &rt, MSG_DONTWAIT); 1506 &ipc, &rt, MSG_DONTWAIT);
1433 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1507 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1434 if (arg->csumoffset >= 0) 1508 if (arg->csumoffset >= 0)
@@ -1436,7 +1510,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1436 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1510 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1437 arg->csum)); 1511 arg->csum));
1438 skb->ip_summed = CHECKSUM_NONE; 1512 skb->ip_summed = CHECKSUM_NONE;
1439 ip_push_pending_frames(sk); 1513 ip_push_pending_frames(sk, &fl4);
1440 } 1514 }
1441 1515
1442 bh_unlock_sock(sk); 1516 bh_unlock_sock(sk);
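
The corking rework above splits the per-socket state out of ip_append_data(): __ip_append_data() now works on an explicit queue and struct inet_cork, and the new ip_make_skb()/ip_send_skb() pair lets a datagram protocol build one complete skb without corking the socket. A hedged sketch of that calling pattern; example_send_dgram() and my_getfrag() are invented names, and fl4, ipc and rt are assumed to have been prepared by an earlier route lookup, roughly as udp_sendmsg() would do:

#include <net/ip.h>
#include <linux/udp.h>

/* payload copier with the getfrag() signature, provided elsewhere */
static int my_getfrag(void *from, char *to, int offset, int len,
		      int odd, struct sk_buff *skb);

static int example_send_dgram(struct sock *sk, struct flowi4 *fl4,
			      struct ipcm_cookie *ipc, struct rtable *rt,
			      void *payload, int len)
{
	struct sk_buff *skb;

	/* Builds a single datagram on a private queue; the rt reference
	 * is stolen by the cork setup, so the caller must not release it. */
	skb = ip_make_skb(sk, fl4, my_getfrag, payload, len,
			  sizeof(struct udphdr), ipc, &rt, MSG_DONTWAIT);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);

	/* ... fill in the transport header (ports, checksum) here ... */

	return ip_send_skb(skb);	/* hands the skb to ip_local_out() */
}
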
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64b70ad162e3..ab0c9efd1efa 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -131,7 +131,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
131static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) 131static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
132{ 132{
133 struct sockaddr_in sin; 133 struct sockaddr_in sin;
134 struct iphdr *iph = ip_hdr(skb); 134 const struct iphdr *iph = ip_hdr(skb);
135 __be16 *ports = (__be16 *)skb_transport_header(skb); 135 __be16 *ports = (__be16 *)skb_transport_header(skb);
136 136
137 if (skb_transport_offset(skb) + 4 > skb->len) 137 if (skb_transport_offset(skb) + 4 > skb->len)
@@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
238 but receiver should be enough clever f.e. to forward mtrace requests, 238 but receiver should be enough clever f.e. to forward mtrace requests,
239 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
240 */ 240 */
241struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain __rcu *ip_ra_chain;
242static DEFINE_SPINLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243 243
244 244
@@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head)
253int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
254 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
255{ 255{
256 struct ip_ra_chain *ra, *new_ra, **rap; 256 struct ip_ra_chain *ra, *new_ra;
257 struct ip_ra_chain __rcu **rap;
257 258
258 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) 259 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
259 return -EINVAL; 260 return -EINVAL;
@@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on,
261 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 262 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
262 263
263 spin_lock_bh(&ip_ra_lock); 264 spin_lock_bh(&ip_ra_lock);
264 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 265 for (rap = &ip_ra_chain;
266 (ra = rcu_dereference_protected(*rap,
267 lockdep_is_held(&ip_ra_lock))) != NULL;
268 rap = &ra->next) {
265 if (ra->sk == sk) { 269 if (ra->sk == sk) {
266 if (on) { 270 if (on) {
267 spin_unlock_bh(&ip_ra_lock); 271 spin_unlock_bh(&ip_ra_lock);
@@ -447,6 +451,11 @@ out:
447} 451}
448 452
449 453
454static void opt_kfree_rcu(struct rcu_head *head)
455{
456 kfree(container_of(head, struct ip_options_rcu, rcu));
457}
458
450/* 459/*
451 * Socket option code for IP. This is the end of the line after any 460 * Socket option code for IP. This is the end of the line after any
452 * TCP,UDP etc options on an IP socket. 461 * TCP,UDP etc options on an IP socket.
@@ -493,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
493 switch (optname) { 502 switch (optname) {
494 case IP_OPTIONS: 503 case IP_OPTIONS:
495 { 504 {
496 struct ip_options *opt = NULL; 505 struct ip_options_rcu *old, *opt = NULL;
506
497 if (optlen > 40) 507 if (optlen > 40)
498 goto e_inval; 508 goto e_inval;
499 err = ip_options_get_from_user(sock_net(sk), &opt, 509 err = ip_options_get_from_user(sock_net(sk), &opt,
500 optval, optlen); 510 optval, optlen);
501 if (err) 511 if (err)
502 break; 512 break;
513 old = rcu_dereference_protected(inet->inet_opt,
514 sock_owned_by_user(sk));
503 if (inet->is_icsk) { 515 if (inet->is_icsk) {
504 struct inet_connection_sock *icsk = inet_csk(sk); 516 struct inet_connection_sock *icsk = inet_csk(sk);
505#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 517#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -508,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
508 (TCPF_LISTEN | TCPF_CLOSE)) && 520 (TCPF_LISTEN | TCPF_CLOSE)) &&
509 inet->inet_daddr != LOOPBACK4_IPV6)) { 521 inet->inet_daddr != LOOPBACK4_IPV6)) {
510#endif 522#endif
511 if (inet->opt) 523 if (old)
512 icsk->icsk_ext_hdr_len -= inet->opt->optlen; 524 icsk->icsk_ext_hdr_len -= old->opt.optlen;
513 if (opt) 525 if (opt)
514 icsk->icsk_ext_hdr_len += opt->optlen; 526 icsk->icsk_ext_hdr_len += opt->opt.optlen;
515 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
516#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 528#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
517 } 529 }
518#endif 530#endif
519 } 531 }
520 opt = xchg(&inet->opt, opt); 532 rcu_assign_pointer(inet->inet_opt, opt);
521 kfree(opt); 533 if (old)
534 call_rcu(&old->rcu, opt_kfree_rcu);
522 break; 535 break;
523 } 536 }
524 case IP_PKTINFO: 537 case IP_PKTINFO:
@@ -1077,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1077 case IP_OPTIONS: 1090 case IP_OPTIONS:
1078 { 1091 {
1079 unsigned char optbuf[sizeof(struct ip_options)+40]; 1092 unsigned char optbuf[sizeof(struct ip_options)+40];
1080 struct ip_options * opt = (struct ip_options *)optbuf; 1093 struct ip_options *opt = (struct ip_options *)optbuf;
1094 struct ip_options_rcu *inet_opt;
1095
1096 inet_opt = rcu_dereference_protected(inet->inet_opt,
1097 sock_owned_by_user(sk));
1081 opt->optlen = 0; 1098 opt->optlen = 0;
1082 if (inet->opt) 1099 if (inet_opt)
1083 memcpy(optbuf, inet->opt, 1100 memcpy(optbuf, &inet_opt->opt,
1084 sizeof(struct ip_options)+ 1101 sizeof(struct ip_options) +
1085 inet->opt->optlen); 1102 inet_opt->opt.optlen);
1086 release_sock(sk); 1103 release_sock(sk);
1087 1104
1088 if (opt->optlen == 0) 1105 if (opt->optlen == 0)
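
With this change IP_OPTIONS swaps inet->inet_opt under the socket lock with rcu_assign_pointer() and defers freeing the old block through call_rcu(..., opt_kfree_rcu), so readers on the transmit path only need rcu_read_lock(). A minimal read-side sketch in the style ip_queue_xmit() uses after this series; the function name is invented:

static void example_peek_options(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		pr_debug("source routed, first hop %pI4\n",
			 &inet_opt->opt.faddr);
	rcu_read_unlock();	/* inet_opt may be freed after this point */
}
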
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 629067571f02..c857f6f49b03 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -27,7 +27,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
27{ 27{
28 struct net *net = dev_net(skb->dev); 28 struct net *net = dev_net(skb->dev);
29 __be32 spi; 29 __be32 spi;
30 struct iphdr *iph = (struct iphdr *)skb->data; 30 const struct iphdr *iph = (const struct iphdr *)skb->data;
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
@@ -36,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
36 return; 36 return;
37 37
38 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
39 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
40 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
41 if (!x) 41 if (!x)
42 return; 42 return;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 3a6e1ec5e9ae..ab7e5542c1cf 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -87,8 +87,8 @@
87#endif 87#endif
88 88
89/* Define the friendly delay before and after opening net devices */ 89/* Define the friendly delay before and after opening net devices */
90#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ 90#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
91#define CONF_POST_OPEN 1 /* After opening: 1 second */ 91#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
92 92
93/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ 93/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
94#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ 94#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
@@ -188,14 +188,14 @@ struct ic_device {
188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ 188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
189static struct net_device *ic_dev __initdata = NULL; /* Selected device */ 189static struct net_device *ic_dev __initdata = NULL; /* Selected device */
190 190
191static bool __init ic_device_match(struct net_device *dev) 191static bool __init ic_is_init_dev(struct net_device *dev)
192{ 192{
193 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : 193 if (dev->flags & IFF_LOOPBACK)
194 return false;
195 return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
194 (!(dev->flags & IFF_LOOPBACK) && 196 (!(dev->flags & IFF_LOOPBACK) &&
195 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && 197 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
196 strncmp(dev->name, "dummy", 5))) 198 strncmp(dev->name, "dummy", 5));
197 return true;
198 return false;
199} 199}
200 200
201static int __init ic_open_devs(void) 201static int __init ic_open_devs(void)
@@ -203,6 +203,7 @@ static int __init ic_open_devs(void)
203 struct ic_device *d, **last; 203 struct ic_device *d, **last;
204 struct net_device *dev; 204 struct net_device *dev;
205 unsigned short oflags; 205 unsigned short oflags;
206 unsigned long start;
206 207
207 last = &ic_first_dev; 208 last = &ic_first_dev;
208 rtnl_lock(); 209 rtnl_lock();
@@ -216,9 +217,7 @@ static int __init ic_open_devs(void)
216 } 217 }
217 218
218 for_each_netdev(&init_net, dev) { 219 for_each_netdev(&init_net, dev) {
219 if (dev->flags & IFF_LOOPBACK) 220 if (ic_is_init_dev(dev)) {
220 continue;
221 if (ic_device_match(dev)) {
222 int able = 0; 221 int able = 0;
223 if (dev->mtu >= 364) 222 if (dev->mtu >= 364)
224 able |= IC_BOOTP; 223 able |= IC_BOOTP;
@@ -252,6 +251,17 @@ static int __init ic_open_devs(void)
252 dev->name, able, d->xid)); 251 dev->name, able, d->xid));
253 } 252 }
254 } 253 }
254
255 /* wait for a carrier on at least one device */
256 start = jiffies;
257 while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
258 for_each_netdev(&init_net, dev)
259 if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
260 goto have_carrier;
261
262 msleep(1);
263 }
264have_carrier:
255 rtnl_unlock(); 265 rtnl_unlock();
256 266
257 *last = NULL; 267 *last = NULL;
@@ -1191,13 +1201,13 @@ static int __init ic_dynamic(void)
1191 (ic_proto_enabled & IC_USE_DHCP) && 1201 (ic_proto_enabled & IC_USE_DHCP) &&
1192 ic_dhcp_msgtype != DHCPACK) { 1202 ic_dhcp_msgtype != DHCPACK) {
1193 ic_got_reply = 0; 1203 ic_got_reply = 0;
1194 printk(","); 1204 printk(KERN_CONT ",");
1195 continue; 1205 continue;
1196 } 1206 }
1197#endif /* IPCONFIG_DHCP */ 1207#endif /* IPCONFIG_DHCP */
1198 1208
1199 if (ic_got_reply) { 1209 if (ic_got_reply) {
1200 printk(" OK\n"); 1210 printk(KERN_CONT " OK\n");
1201 break; 1211 break;
1202 } 1212 }
1203 1213
@@ -1205,7 +1215,7 @@ static int __init ic_dynamic(void)
1205 continue; 1215 continue;
1206 1216
1207 if (! --retries) { 1217 if (! --retries) {
1208 printk(" timed out!\n"); 1218 printk(KERN_CONT " timed out!\n");
1209 break; 1219 break;
1210 } 1220 }
1211 1221
@@ -1215,7 +1225,7 @@ static int __init ic_dynamic(void)
1215 if (timeout > CONF_TIMEOUT_MAX) 1225 if (timeout > CONF_TIMEOUT_MAX)
1216 timeout = CONF_TIMEOUT_MAX; 1226 timeout = CONF_TIMEOUT_MAX;
1217 1227
1218 printk("."); 1228 printk(KERN_CONT ".");
1219 } 1229 }
1220 1230
1221#ifdef IPCONFIG_BOOTP 1231#ifdef IPCONFIG_BOOTP
@@ -1236,7 +1246,7 @@ static int __init ic_dynamic(void)
1236 ((ic_got_reply & IC_RARP) ? "RARP" 1246 ((ic_got_reply & IC_RARP) ? "RARP"
1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1247 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1238 &ic_servaddr); 1248 &ic_servaddr);
1239 printk("my address is %pI4\n", &ic_myaddr); 1249 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1240 1250
1241 return 0; 1251 return 0;
1242} 1252}
@@ -1324,14 +1334,13 @@ static int __init wait_for_devices(void)
1324{ 1334{
1325 int i; 1335 int i;
1326 1336
1327 msleep(CONF_PRE_OPEN);
1328 for (i = 0; i < DEVICE_WAIT_MAX; i++) { 1337 for (i = 0; i < DEVICE_WAIT_MAX; i++) {
1329 struct net_device *dev; 1338 struct net_device *dev;
1330 int found = 0; 1339 int found = 0;
1331 1340
1332 rtnl_lock(); 1341 rtnl_lock();
1333 for_each_netdev(&init_net, dev) { 1342 for_each_netdev(&init_net, dev) {
1334 if (ic_device_match(dev)) { 1343 if (ic_is_init_dev(dev)) {
1335 found = 1; 1344 found = 1;
1336 break; 1345 break;
1337 } 1346 }
@@ -1378,7 +1387,7 @@ static int __init ip_auto_config(void)
1378 return err; 1387 return err;
1379 1388
1380 /* Give drivers a chance to settle */ 1389 /* Give drivers a chance to settle */
1381 ssleep(CONF_POST_OPEN); 1390 msleep(CONF_POST_OPEN);
1382 1391
1383 /* 1392 /*
1384 * If the config information is insufficient (e.g., our IP address or 1393 * If the config information is insufficient (e.g., our IP address or
@@ -1444,7 +1453,7 @@ static int __init ip_auto_config(void)
1444 root_server_addr = addr; 1453 root_server_addr = addr;
1445 1454
1446 /* 1455 /*
1447 * Use defaults whereever applicable. 1456 * Use defaults wherever applicable.
1448 */ 1457 */
1449 if (ic_defaults() < 0) 1458 if (ic_defaults() < 0)
1450 return -1; 1459 return -1;
@@ -1468,19 +1477,19 @@ static int __init ip_auto_config(void)
1468 /* 1477 /*
1469 * Clue in the operator. 1478 * Clue in the operator.
1470 */ 1479 */
1471 printk("IP-Config: Complete:"); 1480 printk("IP-Config: Complete:\n");
1472 printk("\n device=%s", ic_dev->name); 1481 printk(" device=%s", ic_dev->name);
1473 printk(", addr=%pI4", &ic_myaddr); 1482 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1474 printk(", mask=%pI4", &ic_netmask); 1483 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1475 printk(", gw=%pI4", &ic_gateway); 1484 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1476 printk(",\n host=%s, domain=%s, nis-domain=%s", 1485 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1477 utsname()->nodename, ic_domain, utsname()->domainname); 1486 utsname()->nodename, ic_domain, utsname()->domainname);
1478 printk(",\n bootserver=%pI4", &ic_servaddr); 1487 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1479 printk(", rootserver=%pI4", &root_server_addr); 1488 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1480 printk(", rootpath=%s", root_server_path); 1489 printk(KERN_CONT ", rootpath=%s", root_server_path);
1481 if (ic_dev_mtu) 1490 if (ic_dev_mtu)
1482 printk(", mtu=%d", ic_dev_mtu); 1491 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1483 printk("\n"); 1492 printk(KERN_CONT "\n");
1484#endif /* !SILENT */ 1493#endif /* !SILENT */
1485 1494
1486 return 0; 1495 return 0;
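
Besides the KERN_CONT printk cleanups, ipconfig now drops the fixed pre-open delay and instead waits up to CONF_CARRIER_TIMEOUT (120 s) for at least one candidate interface to report carrier. The patch open-codes the deadline as jiffies arithmetic; the sketch below expresses the same loop with the time_before() idiom purely for illustration, it is not the patch's code, and the caller is assumed to hold the RTNL as ic_open_devs() does:

static void example_wait_for_carrier(void)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT);
	struct net_device *dev;

	while (time_before(jiffies, deadline)) {
		for_each_netdev(&init_net, dev)
			if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
				return;		/* a link is up, stop waiting */
		msleep(1);
	}
	/* timed out: autoconfiguration proceeds and may still succeed */
}
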
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..378b20b7ca6e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
122 122
123static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
124struct ipip_net { 124struct ipip_net {
125 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
129 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
130 130
131 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
132}; 132};
133 133
134static void ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137/* 138/*
138 * Locking : hash tables are protected by RCU and a spinlock 139 * Locking : hash tables are protected by RCU and RTNL
139 */ 140 */
140static DEFINE_SPINLOCK(ipip_lock);
141 141
142#define for_each_ip_tunnel_rcu(start) \ 142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
172
145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
146 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
147{ 175{
148 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
149 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
150 struct ip_tunnel *t; 178 struct ip_tunnel *t;
151 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
152 180
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
169 return NULL; 197 return NULL;
170} 198}
171 199
172static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
173 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
174{ 202{
175 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
176 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
177 unsigned h = 0; 205 unsigned int h = 0;
178 int prio = 0; 206 int prio = 0;
179 207
180 if (remote) { 208 if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
188 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
189} 217}
190 218
191static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
192 struct ip_tunnel *t) 220 struct ip_tunnel *t)
193{ 221{
194 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
196 224
197static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
198{ 226{
199 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
200 228 struct ip_tunnel *iter;
201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
202 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
203 spin_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
204 *tp = t->next; 232 tp = &iter->next) {
205 spin_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
206 break; 235 break;
207 } 236 }
208 } 237 }
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
210 239
211static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
212{ 241{
213 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
214 243
215 spin_lock_bh(&ipip_lock); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
216 t->next = *tp;
217 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
218 spin_unlock_bh(&ipip_lock);
219} 246}
220 247
221static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
223{ 250{
224 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
225 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
226 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
227 struct net_device *dev; 255 struct net_device *dev;
228 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
229 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
230 258
231 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
232 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
233 return t; 263 return t;
234 } 264 }
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
238 if (parms->name[0]) 268 if (parms->name[0])
239 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
240 else 270 else
241 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
242 272
243 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
244 if (dev == NULL) 274 if (dev == NULL)
@@ -246,15 +276,11 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
246 276
247 dev_net_set(dev, net); 277 dev_net_set(dev, net);
248 278
249 if (strchr(name, '%')) {
250 if (dev_alloc_name(dev, name) < 0)
251 goto failed_free;
252 }
253
254 nt = netdev_priv(dev); 279 nt = netdev_priv(dev);
255 nt->parms = *parms; 280 nt->parms = *parms;
256 281
257 ipip_tunnel_init(dev); 282 if (ipip_tunnel_init(dev) < 0)
283 goto failed_free;
258 284
259 if (register_netdevice(dev) < 0) 285 if (register_netdevice(dev) < 0)
260 goto failed_free; 286 goto failed_free;
@@ -264,20 +290,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
264 return nt; 290 return nt;
265 291
266failed_free: 292failed_free:
267 free_netdev(dev); 293 ipip_dev_free(dev);
268 return NULL; 294 return NULL;
269} 295}
270 296
297/* called with RTNL */
271static void ipip_tunnel_uninit(struct net_device *dev) 298static void ipip_tunnel_uninit(struct net_device *dev)
272{ 299{
273 struct net *net = dev_net(dev); 300 struct net *net = dev_net(dev);
274 struct ipip_net *ipn = net_generic(net, ipip_net_id); 301 struct ipip_net *ipn = net_generic(net, ipip_net_id);
275 302
276 if (dev == ipn->fb_tunnel_dev) { 303 if (dev == ipn->fb_tunnel_dev)
277 spin_lock_bh(&ipip_lock); 304 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
278 ipn->tunnels_wc[0] = NULL; 305 else
279 spin_unlock_bh(&ipip_lock);
280 } else
281 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 306 ipip_tunnel_unlink(ipn, netdev_priv(dev));
282 dev_put(dev); 307 dev_put(dev);
283} 308}
@@ -289,7 +314,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
289 8 bytes of packet payload. It means, that precise relaying of 314 8 bytes of packet payload. It means, that precise relaying of
290 ICMP in the real Internet is absolutely infeasible. 315 ICMP in the real Internet is absolutely infeasible.
291 */ 316 */
292 struct iphdr *iph = (struct iphdr *)skb->data; 317 const struct iphdr *iph = (const struct iphdr *)skb->data;
293 const int type = icmp_hdr(skb)->type; 318 const int type = icmp_hdr(skb)->type;
294 const int code = icmp_hdr(skb)->code; 319 const int code = icmp_hdr(skb)->code;
295 struct ip_tunnel *t; 320 struct ip_tunnel *t;
@@ -359,8 +384,10 @@ static int ipip_rcv(struct sk_buff *skb)
359 const struct iphdr *iph = ip_hdr(skb); 384 const struct iphdr *iph = ip_hdr(skb);
360 385
361 rcu_read_lock(); 386 rcu_read_lock();
362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 387 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
363 iph->saddr, iph->daddr)) != NULL) { 388 if (tunnel != NULL) {
389 struct pcpu_tstats *tstats;
390
364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 391 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
365 rcu_read_unlock(); 392 rcu_read_unlock();
366 kfree_skb(skb); 393 kfree_skb(skb);
@@ -374,10 +401,16 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 401 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 402 skb->pkt_type = PACKET_HOST;
376 403
377 skb_tunnel_rx(skb, tunnel->dev); 404 tstats = this_cpu_ptr(tunnel->dev->tstats);
405 tstats->rx_packets++;
406 tstats->rx_bytes += skb->len;
407
408 __skb_tunnel_rx(skb, tunnel->dev);
378 409
379 ipip_ecn_decapsulate(iph, skb); 410 ipip_ecn_decapsulate(iph, skb);
411
380 netif_rx(skb); 412 netif_rx(skb);
413
381 rcu_read_unlock(); 414 rcu_read_unlock();
382 return 0; 415 return 0;
383 } 416 }
@@ -394,52 +427,49 @@ static int ipip_rcv(struct sk_buff *skb)
394static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 427static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
395{ 428{
396 struct ip_tunnel *tunnel = netdev_priv(dev); 429 struct ip_tunnel *tunnel = netdev_priv(dev);
397 struct net_device_stats *stats = &dev->stats; 430 struct pcpu_tstats *tstats;
398 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); 431 const struct iphdr *tiph = &tunnel->parms.iph;
399 struct iphdr *tiph = &tunnel->parms.iph;
400 u8 tos = tunnel->parms.iph.tos; 432 u8 tos = tunnel->parms.iph.tos;
401 __be16 df = tiph->frag_off; 433 __be16 df = tiph->frag_off;
402 struct rtable *rt; /* Route to the other host */ 434 struct rtable *rt; /* Route to the other host */
403 struct net_device *tdev; /* Device to other host */ 435 struct net_device *tdev; /* Device to other host */
404 struct iphdr *old_iph = ip_hdr(skb); 436 const struct iphdr *old_iph = ip_hdr(skb);
405 struct iphdr *iph; /* Our new IP header */ 437 struct iphdr *iph; /* Our new IP header */
406 unsigned int max_headroom; /* The extra header space needed */ 438 unsigned int max_headroom; /* The extra header space needed */
407 __be32 dst = tiph->daddr; 439 __be32 dst = tiph->daddr;
440 struct flowi4 fl4;
408 int mtu; 441 int mtu;
409 442
410 if (skb->protocol != htons(ETH_P_IP)) 443 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 444 goto tx_error;
412 445
413 if (tos&1) 446 if (tos & 1)
414 tos = old_iph->tos; 447 tos = old_iph->tos;
415 448
416 if (!dst) { 449 if (!dst) {
417 /* NBMA tunnel */ 450 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 451 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 452 dev->stats.tx_fifo_errors++;
420 goto tx_error; 453 goto tx_error;
421 } 454 }
422 if ((dst = rt->rt_gateway) == 0) 455 if ((dst = rt->rt_gateway) == 0)
423 goto tx_error_icmp; 456 goto tx_error_icmp;
424 } 457 }
425 458
426 { 459 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
427 struct flowi fl = { .oif = tunnel->parms.link, 460 dst, tiph->saddr,
428 .nl_u = { .ip4_u = 461 0, 0,
429 { .daddr = dst, 462 IPPROTO_IPIP, RT_TOS(tos),
430 .saddr = tiph->saddr, 463 tunnel->parms.link);
431 .tos = RT_TOS(tos) } }, 464 if (IS_ERR(rt)) {
432 .proto = IPPROTO_IPIP }; 465 dev->stats.tx_carrier_errors++;
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 466 goto tx_error_icmp;
434 stats->tx_carrier_errors++;
435 goto tx_error_icmp;
436 }
437 } 467 }
438 tdev = rt->dst.dev; 468 tdev = rt->dst.dev;
439 469
440 if (tdev == dev) { 470 if (tdev == dev) {
441 ip_rt_put(rt); 471 ip_rt_put(rt);
442 stats->collisions++; 472 dev->stats.collisions++;
443 goto tx_error; 473 goto tx_error;
444 } 474 }
445 475
@@ -449,7 +479,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 479 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 480
451 if (mtu < 68) { 481 if (mtu < 68) {
452 stats->collisions++; 482 dev->stats.collisions++;
453 ip_rt_put(rt); 483 ip_rt_put(rt);
454 goto tx_error; 484 goto tx_error;
455 } 485 }
@@ -485,7 +515,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
485 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 515 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
486 if (!new_skb) { 516 if (!new_skb) {
487 ip_rt_put(rt); 517 ip_rt_put(rt);
488 txq->tx_dropped++; 518 dev->stats.tx_dropped++;
489 dev_kfree_skb(skb); 519 dev_kfree_skb(skb);
490 return NETDEV_TX_OK; 520 return NETDEV_TX_OK;
491 } 521 }
@@ -515,21 +545,21 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
515 iph->frag_off = df; 545 iph->frag_off = df;
516 iph->protocol = IPPROTO_IPIP; 546 iph->protocol = IPPROTO_IPIP;
517 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); 547 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
518 iph->daddr = rt->rt_dst; 548 iph->daddr = fl4.daddr;
519 iph->saddr = rt->rt_src; 549 iph->saddr = fl4.saddr;
520 550
521 if ((iph->ttl = tiph->ttl) == 0) 551 if ((iph->ttl = tiph->ttl) == 0)
522 iph->ttl = old_iph->ttl; 552 iph->ttl = old_iph->ttl;
523 553
524 nf_reset(skb); 554 nf_reset(skb);
525 555 tstats = this_cpu_ptr(dev->tstats);
526 IPTUNNEL_XMIT(); 556 __IPTUNNEL_XMIT(tstats, &dev->stats);
527 return NETDEV_TX_OK; 557 return NETDEV_TX_OK;
528 558
529tx_error_icmp: 559tx_error_icmp:
530 dst_link_failure(skb); 560 dst_link_failure(skb);
531tx_error: 561tx_error:
532 stats->tx_errors++; 562 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 563 dev_kfree_skb(skb);
534 return NETDEV_TX_OK; 564 return NETDEV_TX_OK;
535} 565}
@@ -538,20 +568,22 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
538{ 568{
539 struct net_device *tdev = NULL; 569 struct net_device *tdev = NULL;
540 struct ip_tunnel *tunnel; 570 struct ip_tunnel *tunnel;
541 struct iphdr *iph; 571 const struct iphdr *iph;
542 572
543 tunnel = netdev_priv(dev); 573 tunnel = netdev_priv(dev);
544 iph = &tunnel->parms.iph; 574 iph = &tunnel->parms.iph;
545 575
546 if (iph->daddr) { 576 if (iph->daddr) {
547 struct flowi fl = { .oif = tunnel->parms.link,
548 .nl_u = { .ip4_u =
549 { .daddr = iph->daddr,
550 .saddr = iph->saddr,
551 .tos = RT_TOS(iph->tos) } },
552 .proto = IPPROTO_IPIP };
553 struct rtable *rt; 577 struct rtable *rt;
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 578 struct flowi4 fl4;
579
580 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
581 iph->daddr, iph->saddr,
582 0, 0,
583 IPPROTO_IPIP,
584 RT_TOS(iph->tos),
585 tunnel->parms.link);
586 if (!IS_ERR(rt)) {
555 tdev = rt->dst.dev; 587 tdev = rt->dst.dev;
556 ip_rt_put(rt); 588 ip_rt_put(rt);
557 } 589 }
@@ -627,6 +659,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
627 } 659 }
628 t = netdev_priv(dev); 660 t = netdev_priv(dev);
629 ipip_tunnel_unlink(ipn, t); 661 ipip_tunnel_unlink(ipn, t);
662 synchronize_net();
630 t->parms.iph.saddr = p.iph.saddr; 663 t->parms.iph.saddr = p.iph.saddr;
631 t->parms.iph.daddr = p.iph.daddr; 664 t->parms.iph.daddr = p.iph.daddr;
632 memcpy(dev->dev_addr, &p.iph.saddr, 4); 665 memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -696,13 +729,19 @@ static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_start_xmit = ipip_tunnel_xmit, 729 .ndo_start_xmit = ipip_tunnel_xmit,
697 .ndo_do_ioctl = ipip_tunnel_ioctl, 730 .ndo_do_ioctl = ipip_tunnel_ioctl,
698 .ndo_change_mtu = ipip_tunnel_change_mtu, 731 .ndo_change_mtu = ipip_tunnel_change_mtu,
699 732 .ndo_get_stats = ipip_get_stats,
700}; 733};
701 734
735static void ipip_dev_free(struct net_device *dev)
736{
737 free_percpu(dev->tstats);
738 free_netdev(dev);
739}
740
702static void ipip_tunnel_setup(struct net_device *dev) 741static void ipip_tunnel_setup(struct net_device *dev)
703{ 742{
704 dev->netdev_ops = &ipip_netdev_ops; 743 dev->netdev_ops = &ipip_netdev_ops;
705 dev->destructor = free_netdev; 744 dev->destructor = ipip_dev_free;
706 745
707 dev->type = ARPHRD_TUNNEL; 746 dev->type = ARPHRD_TUNNEL;
708 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 747 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +750,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
711 dev->iflink = 0; 750 dev->iflink = 0;
712 dev->addr_len = 4; 751 dev->addr_len = 4;
713 dev->features |= NETIF_F_NETNS_LOCAL; 752 dev->features |= NETIF_F_NETNS_LOCAL;
753 dev->features |= NETIF_F_LLTX;
714 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 754 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
715} 755}
716 756
717static void ipip_tunnel_init(struct net_device *dev) 757static int ipip_tunnel_init(struct net_device *dev)
718{ 758{
719 struct ip_tunnel *tunnel = netdev_priv(dev); 759 struct ip_tunnel *tunnel = netdev_priv(dev);
720 760
@@ -725,9 +765,15 @@ static void ipip_tunnel_init(struct net_device *dev)
725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 765 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
726 766
727 ipip_tunnel_bind_dev(dev); 767 ipip_tunnel_bind_dev(dev);
768
769 dev->tstats = alloc_percpu(struct pcpu_tstats);
770 if (!dev->tstats)
771 return -ENOMEM;
772
773 return 0;
728} 774}
729 775
730static void __net_init ipip_fb_tunnel_init(struct net_device *dev) 776static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
731{ 777{
732 struct ip_tunnel *tunnel = netdev_priv(dev); 778 struct ip_tunnel *tunnel = netdev_priv(dev);
733 struct iphdr *iph = &tunnel->parms.iph; 779 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +786,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
740 iph->protocol = IPPROTO_IPIP; 786 iph->protocol = IPPROTO_IPIP;
741 iph->ihl = 5; 787 iph->ihl = 5;
742 788
789 dev->tstats = alloc_percpu(struct pcpu_tstats);
790 if (!dev->tstats)
791 return -ENOMEM;
792
743 dev_hold(dev); 793 dev_hold(dev);
744 ipn->tunnels_wc[0] = tunnel; 794 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
795 return 0;
745} 796}
746 797
747static struct xfrm_tunnel ipip_handler = { 798static struct xfrm_tunnel ipip_handler __read_mostly = {
748 .handler = ipip_rcv, 799 .handler = ipip_rcv,
749 .err_handler = ipip_err, 800 .err_handler = ipip_err,
750 .priority = 1, 801 .priority = 1,
@@ -760,11 +811,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
760 for (prio = 1; prio < 4; prio++) { 811 for (prio = 1; prio < 4; prio++) {
761 int h; 812 int h;
762 for (h = 0; h < HASH_SIZE; h++) { 813 for (h = 0; h < HASH_SIZE; h++) {
763 struct ip_tunnel *t = ipn->tunnels[prio][h]; 814 struct ip_tunnel *t;
764 815
816 t = rtnl_dereference(ipn->tunnels[prio][h]);
765 while (t != NULL) { 817 while (t != NULL) {
766 unregister_netdevice_queue(t->dev, head); 818 unregister_netdevice_queue(t->dev, head);
767 t = t->next; 819 t = rtnl_dereference(t->next);
768 } 820 }
769 } 821 }
770 } 822 }
@@ -789,7 +841,9 @@ static int __net_init ipip_init_net(struct net *net)
789 } 841 }
790 dev_net_set(ipn->fb_tunnel_dev, net); 842 dev_net_set(ipn->fb_tunnel_dev, net);
791 843
792 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 844 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
845 if (err)
846 goto err_reg_dev;
793 847
794 if ((err = register_netdev(ipn->fb_tunnel_dev))) 848 if ((err = register_netdev(ipn->fb_tunnel_dev)))
795 goto err_reg_dev; 849 goto err_reg_dev;
@@ -797,7 +851,7 @@ static int __net_init ipip_init_net(struct net *net)
797 return 0; 851 return 0;
798 852
799err_reg_dev: 853err_reg_dev:
800 free_netdev(ipn->fb_tunnel_dev); 854 ipip_dev_free(ipn->fb_tunnel_dev);
801err_alloc_dev: 855err_alloc_dev:
802 /* nothing */ 856 /* nothing */
803 return err; 857 return err;
@@ -850,3 +904,4 @@ static void __exit ipip_fini(void)
850module_init(ipip_init); 904module_init(ipip_init);
851module_exit(ipip_fini); 905module_exit(ipip_fini);
852MODULE_LICENSE("GPL"); 906MODULE_LICENSE("GPL");
907MODULE_ALIAS_NETDEV("tunl0");
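
The ipip.c changes above drop the per-tunnel spinlock and protect the hash chains with RCU plus RTNL: the chain pointers are annotated __rcu, writers (which hold RTNL) publish with rcu_assign_pointer() and walk chains with rtnl_dereference(), and readers such as ipip_tunnel_lookup() traverse under rcu_read_lock(). A stripped-down sketch of that link/unlink/lookup pattern, using a simplified stand-in structure rather than struct ip_tunnel:

#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>

/* Simplified stand-in for one tunnel hash chain; not the driver itself. */
struct tun {
	struct tun __rcu *next;
	__be32 key;
};

static struct tun __rcu *chain;

/* Writer side: caller holds RTNL, so rtnl_dereference() is safe. */
static void tun_link(struct tun *t)
{
	rcu_assign_pointer(t->next, rtnl_dereference(chain));
	rcu_assign_pointer(chain, t);
}

static void tun_unlink(struct tun *t)
{
	struct tun __rcu **tp;
	struct tun *iter;

	for (tp = &chain;
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (iter == t) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

/* Reader side: caller holds rcu_read_lock(). */
static struct tun *tun_find(__be32 key)
{
	struct tun *t;

	for (t = rcu_dereference(chain); t; t = rcu_dereference(t->next))
		if (t->key == key)
			return t;
	return NULL;
}

An unlinked entry may only be freed after a grace period; the SIOCCHGTUNNEL path above likewise calls synchronize_net() after ipip_tunnel_unlink() before it rewrites the key fields, so readers never observe a half-updated tunnel.
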
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..30a7763c400e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
60#include <linux/notifier.h> 60#include <linux/notifier.h>
61#include <linux/if_arp.h> 61#include <linux/if_arp.h>
62#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
63#include <linux/compat.h>
63#include <net/ipip.h> 64#include <net/ipip.h>
64#include <net/checksum.h> 65#include <net/checksum.h>
65#include <net/netlink.h> 66#include <net/netlink.h>
@@ -75,7 +76,7 @@ struct mr_table {
75 struct net *net; 76 struct net *net;
76#endif 77#endif
77 u32 id; 78 u32 id;
78 struct sock *mroute_sk; 79 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer; 80 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue; 81 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES]; 82 struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +99,7 @@ struct ipmr_result {
98}; 99};
99 100
100/* Big lock, protecting vif table, mrt cache and mroute socket state. 101/* Big lock, protecting vif table, mrt cache and mroute socket state.
101 Note that the changes are semaphored via rtnl_lock. 102 * Note that the changes are semaphored via rtnl_lock.
102 */ 103 */
103 104
104static DEFINE_RWLOCK(mrt_lock); 105static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +114,11 @@ static DEFINE_RWLOCK(mrt_lock);
113static DEFINE_SPINLOCK(mfc_unres_lock); 114static DEFINE_SPINLOCK(mfc_unres_lock);
114 115
115/* We return to original Alan's scheme. Hash table of resolved 116/* We return to original Alan's scheme. Hash table of resolved
116 entries is changed only in process context and protected 117 * entries is changed only in process context and protected
117 with weak lock mrt_lock. Queue of unresolved entries is protected 118 * with weak lock mrt_lock. Queue of unresolved entries is protected
118 with strong spinlock mfc_unres_lock. 119 * with strong spinlock mfc_unres_lock.
119 120 *
120 In this case data path is free of exclusive locks at all. 121 * In this case data path is free of exclusive locks at all.
121 */ 122 */
122 123
123static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
@@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
147 return NULL; 148 return NULL;
148} 149}
149 150
150static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 151static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
151 struct mr_table **mrt) 152 struct mr_table **mrt)
152{ 153{
153 struct ipmr_result res; 154 struct ipmr_result res;
154 struct fib_lookup_arg arg = { .result = &res, }; 155 struct fib_lookup_arg arg = { .result = &res, };
155 int err; 156 int err;
156 157
157 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); 158 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
159 flowi4_to_flowi(flp4), 0, &arg);
158 if (err < 0) 160 if (err < 0)
159 return err; 161 return err;
160 *mrt = res.mrt; 162 *mrt = res.mrt;
@@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
282 return net->ipv4.mrt; 284 return net->ipv4.mrt;
283} 285}
284 286
285static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 287static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
286 struct mr_table **mrt) 288 struct mr_table **mrt)
287{ 289{
288 *mrt = net->ipv4.mrt; 290 *mrt = net->ipv4.mrt;
@@ -396,9 +398,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
396 set_fs(KERNEL_DS); 398 set_fs(KERNEL_DS);
397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 399 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 set_fs(oldfs); 400 set_fs(oldfs);
399 } else 401 } else {
400 err = -EOPNOTSUPP; 402 err = -EOPNOTSUPP;
401 403 }
402 dev = NULL; 404 dev = NULL;
403 405
404 if (err == 0 && 406 if (err == 0 &&
@@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434{ 436{
435 struct net *net = dev_net(dev); 437 struct net *net = dev_net(dev);
436 struct mr_table *mrt; 438 struct mr_table *mrt;
437 struct flowi fl = { 439 struct flowi4 fl4 = {
438 .oif = dev->ifindex, 440 .flowi4_oif = dev->ifindex,
439 .iif = skb->skb_iif, 441 .flowi4_iif = skb->skb_iif,
440 .mark = skb->mark, 442 .flowi4_mark = skb->mark,
441 }; 443 };
442 int err; 444 int err;
443 445
444 err = ipmr_fib_lookup(net, &fl, &mrt); 446 err = ipmr_fib_lookup(net, &fl4, &mrt);
445 if (err < 0) { 447 if (err < 0) {
446 kfree_skb(skb); 448 kfree_skb(skb);
447 return err; 449 return err;
@@ -495,7 +497,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
495 dev->iflink = 0; 497 dev->iflink = 0;
496 498
497 rcu_read_lock(); 499 rcu_read_lock();
498 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 500 in_dev = __in_dev_get_rcu(dev);
501 if (!in_dev) {
499 rcu_read_unlock(); 502 rcu_read_unlock();
500 goto failure; 503 goto failure;
501 } 504 }
@@ -552,9 +555,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
552 mrt->mroute_reg_vif_num = -1; 555 mrt->mroute_reg_vif_num = -1;
553#endif 556#endif
554 557
555 if (vifi+1 == mrt->maxvif) { 558 if (vifi + 1 == mrt->maxvif) {
556 int tmp; 559 int tmp;
557 for (tmp=vifi-1; tmp>=0; tmp--) { 560
561 for (tmp = vifi - 1; tmp >= 0; tmp--) {
558 if (VIF_EXISTS(mrt, tmp)) 562 if (VIF_EXISTS(mrt, tmp))
559 break; 563 break;
560 } 564 }
@@ -565,25 +569,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
565 569
566 dev_set_allmulti(dev, -1); 570 dev_set_allmulti(dev, -1);
567 571
568 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 572 in_dev = __in_dev_get_rtnl(dev);
573 if (in_dev) {
569 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 574 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 ip_rt_multicast_event(in_dev); 575 ip_rt_multicast_event(in_dev);
571 } 576 }
572 577
573 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 578 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
574 unregister_netdevice_queue(dev, head); 579 unregister_netdevice_queue(dev, head);
575 580
576 dev_put(dev); 581 dev_put(dev);
577 return 0; 582 return 0;
578} 583}
579 584
580static inline void ipmr_cache_free(struct mfc_cache *c) 585static void ipmr_cache_free_rcu(struct rcu_head *head)
581{ 586{
587 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
588
582 kmem_cache_free(mrt_cachep, c); 589 kmem_cache_free(mrt_cachep, c);
583} 590}
584 591
592static inline void ipmr_cache_free(struct mfc_cache *c)
593{
594 call_rcu(&c->rcu, ipmr_cache_free_rcu);
595}
596
585/* Destroy an unresolved cache entry, killing queued skbs 597/* Destroy an unresolved cache entry, killing queued skbs
586 and reporting error to netlink readers. 598 * and reporting error to netlink readers.
587 */ 599 */
588 600
589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) 601static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +617,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
605 memset(&e->msg, 0, sizeof(e->msg)); 617 memset(&e->msg, 0, sizeof(e->msg));
606 618
607 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 619 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 } else 620 } else {
609 kfree_skb(skb); 621 kfree_skb(skb);
622 }
610 } 623 }
611 624
612 ipmr_cache_free(c); 625 ipmr_cache_free(c);
@@ -724,13 +737,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
724 case 0: 737 case 0:
725 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 738 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 739 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 if (dev && dev->ip_ptr == NULL) { 740 if (dev && __in_dev_get_rtnl(dev) == NULL) {
728 dev_put(dev); 741 dev_put(dev);
729 return -EADDRNOTAVAIL; 742 return -EADDRNOTAVAIL;
730 } 743 }
731 } else 744 } else {
732 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 745 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 746 }
734 if (!dev) 747 if (!dev)
735 return -EADDRNOTAVAIL; 748 return -EADDRNOTAVAIL;
736 err = dev_set_allmulti(dev, 1); 749 err = dev_set_allmulti(dev, 1);
@@ -743,16 +756,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
743 return -EINVAL; 756 return -EINVAL;
744 } 757 }
745 758
746 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { 759 in_dev = __in_dev_get_rtnl(dev);
760 if (!in_dev) {
747 dev_put(dev); 761 dev_put(dev);
748 return -EADDRNOTAVAIL; 762 return -EADDRNOTAVAIL;
749 } 763 }
750 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 764 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 ip_rt_multicast_event(in_dev); 765 ip_rt_multicast_event(in_dev);
752 766
753 /* 767 /* Fill in the VIF structures */
754 * Fill in the VIF structures 768
755 */
756 v->rate_limit = vifc->vifc_rate_limit; 769 v->rate_limit = vifc->vifc_rate_limit;
757 v->local = vifc->vifc_lcl_addr.s_addr; 770 v->local = vifc->vifc_lcl_addr.s_addr;
758 v->remote = vifc->vifc_rmt_addr.s_addr; 771 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +778,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
765 v->pkt_in = 0; 778 v->pkt_in = 0;
766 v->pkt_out = 0; 779 v->pkt_out = 0;
767 v->link = dev->ifindex; 780 v->link = dev->ifindex;
768 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 781 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
769 v->link = dev->iflink; 782 v->link = dev->iflink;
770 783
771 /* And finish update writing critical data */ 784 /* And finish update writing critical data */
772 write_lock_bh(&mrt_lock); 785 write_lock_bh(&mrt_lock);
773 v->dev = dev; 786 v->dev = dev;
774#ifdef CONFIG_IP_PIMSM 787#ifdef CONFIG_IP_PIMSM
775 if (v->flags&VIFF_REGISTER) 788 if (v->flags & VIFF_REGISTER)
776 mrt->mroute_reg_vif_num = vifi; 789 mrt->mroute_reg_vif_num = vifi;
777#endif 790#endif
778 if (vifi+1 > mrt->maxvif) 791 if (vifi+1 > mrt->maxvif)
@@ -781,6 +794,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
781 return 0; 794 return 0;
782} 795}
783 796
797/* called with rcu_read_lock() */
784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, 798static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 __be32 origin, 799 __be32 origin,
786 __be32 mcastgrp) 800 __be32 mcastgrp)
@@ -788,7 +802,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
788 int line = MFC_HASH(mcastgrp, origin); 802 int line = MFC_HASH(mcastgrp, origin);
789 struct mfc_cache *c; 803 struct mfc_cache *c;
790 804
791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 805 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 806 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 return c; 807 return c;
794 } 808 }
@@ -801,19 +815,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
801static struct mfc_cache *ipmr_cache_alloc(void) 815static struct mfc_cache *ipmr_cache_alloc(void)
802{ 816{
803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 817 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 if (c == NULL) 818
805 return NULL; 819 if (c)
806 c->mfc_un.res.minvif = MAXVIFS; 820 c->mfc_un.res.minvif = MAXVIFS;
807 return c; 821 return c;
808} 822}
809 823
810static struct mfc_cache *ipmr_cache_alloc_unres(void) 824static struct mfc_cache *ipmr_cache_alloc_unres(void)
811{ 825{
812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 826 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 if (c == NULL) 827
814 return NULL; 828 if (c) {
815 skb_queue_head_init(&c->mfc_un.unres.unresolved); 829 skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 c->mfc_un.unres.expires = jiffies + 10*HZ; 830 c->mfc_un.unres.expires = jiffies + 10*HZ;
831 }
817 return c; 832 return c;
818} 833}
819 834
@@ -827,17 +842,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
827 struct sk_buff *skb; 842 struct sk_buff *skb;
828 struct nlmsgerr *e; 843 struct nlmsgerr *e;
829 844
830 /* 845 /* Play the pending entries through our router */
831 * Play the pending entries through our router
832 */
833 846
834 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 847 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 if (ip_hdr(skb)->version == 0) { 848 if (ip_hdr(skb)->version == 0) {
836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 849 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 850
838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { 851 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 nlh->nlmsg_len = (skb_tail_pointer(skb) - 852 nlh->nlmsg_len = skb_tail_pointer(skb) -
840 (u8 *)nlh); 853 (u8 *)nlh;
841 } else { 854 } else {
842 nlh->nlmsg_type = NLMSG_ERROR; 855 nlh->nlmsg_type = NLMSG_ERROR;
843 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 856 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +861,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
848 } 861 }
849 862
850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 863 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 } else 864 } else {
852 ip_mr_forward(net, mrt, skb, c, 0); 865 ip_mr_forward(net, mrt, skb, c, 0);
866 }
853 } 867 }
854} 868}
855 869
@@ -867,6 +881,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
867 const int ihl = ip_hdrlen(pkt); 881 const int ihl = ip_hdrlen(pkt);
868 struct igmphdr *igmp; 882 struct igmphdr *igmp;
869 struct igmpmsg *msg; 883 struct igmpmsg *msg;
884 struct sock *mroute_sk;
870 int ret; 885 int ret;
871 886
872#ifdef CONFIG_IP_PIMSM 887#ifdef CONFIG_IP_PIMSM
@@ -882,9 +897,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
882#ifdef CONFIG_IP_PIMSM 897#ifdef CONFIG_IP_PIMSM
883 if (assert == IGMPMSG_WHOLEPKT) { 898 if (assert == IGMPMSG_WHOLEPKT) {
884 /* Ugly, but we have no choice with this interface. 899 /* Ugly, but we have no choice with this interface.
885 Duplicate old header, fix ihl, length etc. 900 * Duplicate old header, fix ihl, length etc.
886 And all this only to mangle msg->im_msgtype and 901 * And all this only to mangle msg->im_msgtype and
887 to set msg->im_mbz to "mbz" :-) 902 * to set msg->im_mbz to "mbz" :-)
888 */ 903 */
889 skb_push(skb, sizeof(struct iphdr)); 904 skb_push(skb, sizeof(struct iphdr));
890 skb_reset_network_header(skb); 905 skb_reset_network_header(skb);
@@ -901,39 +916,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
901#endif 916#endif
902 { 917 {
903 918
904 /* 919 /* Copy the IP header */
905 * Copy the IP header
906 */
907 920
908 skb->network_header = skb->tail; 921 skb->network_header = skb->tail;
909 skb_put(skb, ihl); 922 skb_put(skb, ihl);
910 skb_copy_to_linear_data(skb, pkt->data, ihl); 923 skb_copy_to_linear_data(skb, pkt->data, ihl);
911 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 924 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
912 msg = (struct igmpmsg *)skb_network_header(skb); 925 msg = (struct igmpmsg *)skb_network_header(skb);
913 msg->im_vif = vifi; 926 msg->im_vif = vifi;
914 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 927 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 928
916 /* 929 /* Add our header */
917 * Add our header
918 */
919 930
920 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 931 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 igmp->type = 932 igmp->type =
922 msg->im_msgtype = assert; 933 msg->im_msgtype = assert;
923 igmp->code = 0; 934 igmp->code = 0;
924 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 935 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
925 skb->transport_header = skb->network_header; 936 skb->transport_header = skb->network_header;
926 } 937 }
927 938
928 if (mrt->mroute_sk == NULL) { 939 rcu_read_lock();
940 mroute_sk = rcu_dereference(mrt->mroute_sk);
941 if (mroute_sk == NULL) {
942 rcu_read_unlock();
929 kfree_skb(skb); 943 kfree_skb(skb);
930 return -EINVAL; 944 return -EINVAL;
931 } 945 }
932 946
933 /* 947 /* Deliver to mrouted */
934 * Deliver to mrouted 948
935 */ 949 ret = sock_queue_rcv_skb(mroute_sk, skb);
936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); 950 rcu_read_unlock();
937 if (ret < 0) { 951 if (ret < 0) {
938 if (net_ratelimit()) 952 if (net_ratelimit())
939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 953 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +979,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
965 } 979 }
966 980
967 if (!found) { 981 if (!found) {
968 /* 982 /* Create a new entry if allowable */
969 * Create a new entry if allowable
970 */
971 983
972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || 984 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 (c = ipmr_cache_alloc_unres()) == NULL) { 985 (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +989,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
977 return -ENOBUFS; 989 return -ENOBUFS;
978 } 990 }
979 991
980 /* 992 /* Fill in the new cache entry */
981 * Fill in the new cache entry 993
982 */
983 c->mfc_parent = -1; 994 c->mfc_parent = -1;
984 c->mfc_origin = iph->saddr; 995 c->mfc_origin = iph->saddr;
985 c->mfc_mcastgrp = iph->daddr; 996 c->mfc_mcastgrp = iph->daddr;
986 997
987 /* 998 /* Reflect first query at mrouted. */
988 * Reflect first query at mrouted. 999
989 */
990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); 1000 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 if (err < 0) { 1001 if (err < 0) {
992 /* If the report failed throw the cache entry 1002 /* If the report failed throw the cache entry
@@ -1006,10 +1016,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1016 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 } 1017 }
1008 1018
1009 /* 1019 /* See if we can append the packet */
1010 * See if we can append the packet 1020
1011 */ 1021 if (c->mfc_un.unres.unresolved.qlen > 3) {
1012 if (c->mfc_un.unres.unresolved.qlen>3) {
1013 kfree_skb(skb); 1022 kfree_skb(skb);
1014 err = -ENOBUFS; 1023 err = -ENOBUFS;
1015 } else { 1024 } else {
@@ -1035,9 +1044,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1044 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1045 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1046 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 write_lock_bh(&mrt_lock); 1047 list_del_rcu(&c->list);
1039 list_del(&c->list);
1040 write_unlock_bh(&mrt_lock);
1041 1048
1042 ipmr_cache_free(c); 1049 ipmr_cache_free(c);
1043 return 0; 1050 return 0;
@@ -1090,9 +1097,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1090 if (!mrtsock) 1097 if (!mrtsock)
1091 c->mfc_flags |= MFC_STATIC; 1098 c->mfc_flags |= MFC_STATIC;
1092 1099
1093 write_lock_bh(&mrt_lock); 1100 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 write_unlock_bh(&mrt_lock);
1096 1101
1097 /* 1102 /*
1098 * Check to see if we resolved a queued list. If so we 1103 * Check to see if we resolved a queued list. If so we
@@ -1130,26 +1135,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
1130 LIST_HEAD(list); 1135 LIST_HEAD(list);
1131 struct mfc_cache *c, *next; 1136 struct mfc_cache *c, *next;
1132 1137
1133 /* 1138 /* Shut down all active vif entries */
1134 * Shut down all active vif entries 1139
1135 */
1136 for (i = 0; i < mrt->maxvif; i++) { 1140 for (i = 0; i < mrt->maxvif; i++) {
1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC)) 1141 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
1138 vif_delete(mrt, i, 0, &list); 1142 vif_delete(mrt, i, 0, &list);
1139 } 1143 }
1140 unregister_netdevice_many(&list); 1144 unregister_netdevice_many(&list);
1141 1145
1142 /* 1146 /* Wipe the cache */
1143 * Wipe the cache 1147
1144 */
1145 for (i = 0; i < MFC_LINES; i++) { 1148 for (i = 0; i < MFC_LINES; i++) {
1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1149 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 if (c->mfc_flags&MFC_STATIC) 1150 if (c->mfc_flags & MFC_STATIC)
1148 continue; 1151 continue;
1149 write_lock_bh(&mrt_lock); 1152 list_del_rcu(&c->list);
1150 list_del(&c->list);
1151 write_unlock_bh(&mrt_lock);
1152
1153 ipmr_cache_free(c); 1153 ipmr_cache_free(c);
1154 } 1154 }
1155 } 1155 }
@@ -1164,6 +1164,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
1164 } 1164 }
1165} 1165}
1166 1166
1167/* called from ip_ra_control(), before an RCU grace period,
1168 * we dont need to call synchronize_rcu() here
1169 */
1167static void mrtsock_destruct(struct sock *sk) 1170static void mrtsock_destruct(struct sock *sk)
1168{ 1171{
1169 struct net *net = sock_net(sk); 1172 struct net *net = sock_net(sk);
@@ -1171,13 +1174,9 @@ static void mrtsock_destruct(struct sock *sk)
1171 1174
1172 rtnl_lock(); 1175 rtnl_lock();
1173 ipmr_for_each_table(mrt, net) { 1176 ipmr_for_each_table(mrt, net) {
1174 if (sk == mrt->mroute_sk) { 1177 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1178 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 1179 rcu_assign_pointer(mrt->mroute_sk, NULL);
1177 write_lock_bh(&mrt_lock);
1178 mrt->mroute_sk = NULL;
1179 write_unlock_bh(&mrt_lock);
1180
1181 mroute_clean_tables(mrt); 1180 mroute_clean_tables(mrt);
1182 } 1181 }
1183 } 1182 }
@@ -1204,7 +1203,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1204 return -ENOENT; 1203 return -ENOENT;
1205 1204
1206 if (optname != MRT_INIT) { 1205 if (optname != MRT_INIT) {
1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) 1206 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1207 !capable(CAP_NET_ADMIN))
1208 return -EACCES; 1208 return -EACCES;
1209 } 1209 }
1210 1210
@@ -1217,23 +1217,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1217 return -ENOPROTOOPT; 1217 return -ENOPROTOOPT;
1218 1218
1219 rtnl_lock(); 1219 rtnl_lock();
1220 if (mrt->mroute_sk) { 1220 if (rtnl_dereference(mrt->mroute_sk)) {
1221 rtnl_unlock(); 1221 rtnl_unlock();
1222 return -EADDRINUSE; 1222 return -EADDRINUSE;
1223 } 1223 }
1224 1224
1225 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1225 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 if (ret == 0) { 1226 if (ret == 0) {
1227 write_lock_bh(&mrt_lock); 1227 rcu_assign_pointer(mrt->mroute_sk, sk);
1228 mrt->mroute_sk = sk;
1229 write_unlock_bh(&mrt_lock);
1230
1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1228 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 } 1229 }
1233 rtnl_unlock(); 1230 rtnl_unlock();
1234 return ret; 1231 return ret;
1235 case MRT_DONE: 1232 case MRT_DONE:
1236 if (sk != mrt->mroute_sk) 1233 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1237 return -EACCES; 1234 return -EACCES;
1238 return ip_ra_control(sk, 0, NULL); 1235 return ip_ra_control(sk, 0, NULL);
1239 case MRT_ADD_VIF: 1236 case MRT_ADD_VIF:
@@ -1246,7 +1243,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1246 return -ENFILE; 1243 return -ENFILE;
1247 rtnl_lock(); 1244 rtnl_lock();
1248 if (optname == MRT_ADD_VIF) { 1245 if (optname == MRT_ADD_VIF) {
1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); 1246 ret = vif_add(net, mrt, &vif,
1247 sk == rtnl_dereference(mrt->mroute_sk));
1250 } else { 1248 } else {
1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); 1249 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 } 1250 }
@@ -1267,7 +1265,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1267 if (optname == MRT_DEL_MFC) 1265 if (optname == MRT_DEL_MFC)
1268 ret = ipmr_mfc_delete(mrt, &mfc); 1266 ret = ipmr_mfc_delete(mrt, &mfc);
1269 else 1267 else
1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); 1268 ret = ipmr_mfc_add(net, mrt, &mfc,
1269 sk == rtnl_dereference(mrt->mroute_sk));
1271 rtnl_unlock(); 1270 rtnl_unlock();
1272 return ret; 1271 return ret;
1273 /* 1272 /*
@@ -1276,7 +1275,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1276 case MRT_ASSERT: 1275 case MRT_ASSERT:
1277 { 1276 {
1278 int v; 1277 int v;
1279 if (get_user(v,(int __user *)optval)) 1278 if (get_user(v, (int __user *)optval))
1280 return -EFAULT; 1279 return -EFAULT;
1281 mrt->mroute_do_assert = (v) ? 1 : 0; 1280 mrt->mroute_do_assert = (v) ? 1 : 0;
1282 return 0; 1281 return 0;
@@ -1286,7 +1285,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1286 { 1285 {
1287 int v; 1286 int v;
1288 1287
1289 if (get_user(v,(int __user *)optval)) 1288 if (get_user(v, (int __user *)optval))
1290 return -EFAULT; 1289 return -EFAULT;
1291 v = (v) ? 1 : 0; 1290 v = (v) ? 1 : 0;
1292 1291
@@ -1309,14 +1308,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1309 return -EINVAL; 1308 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval)) 1309 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT; 1310 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314 1311
1315 rtnl_lock(); 1312 rtnl_lock();
1316 ret = 0; 1313 ret = 0;
1317 if (!ipmr_new_table(net, v)) 1314 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1318 ret = -ENOMEM; 1315 ret = -EBUSY;
1319 raw_sk(sk)->ipmr_table = v; 1316 } else {
1317 if (!ipmr_new_table(net, v))
1318 ret = -ENOMEM;
1319 raw_sk(sk)->ipmr_table = v;
1320 }
1320 rtnl_unlock(); 1321 rtnl_unlock();
1321 return ret; 1322 return ret;
1322 } 1323 }
@@ -1347,9 +1348,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1347 1348
1348 if (optname != MRT_VERSION && 1349 if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM 1350#ifdef CONFIG_IP_PIMSM
1350 optname!=MRT_PIM && 1351 optname != MRT_PIM &&
1351#endif 1352#endif
1352 optname!=MRT_ASSERT) 1353 optname != MRT_ASSERT)
1353 return -ENOPROTOOPT; 1354 return -ENOPROTOOPT;
1354 1355
1355 if (get_user(olr, optlen)) 1356 if (get_user(olr, optlen))
@@ -1416,24 +1417,99 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1416 if (copy_from_user(&sr, arg, sizeof(sr))) 1417 if (copy_from_user(&sr, arg, sizeof(sr)))
1417 return -EFAULT; 1418 return -EFAULT;
1418 1419
1419 read_lock(&mrt_lock); 1420 rcu_read_lock();
1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); 1421 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 if (c) { 1422 if (c) {
1422 sr.pktcnt = c->mfc_un.res.pkt; 1423 sr.pktcnt = c->mfc_un.res.pkt;
1423 sr.bytecnt = c->mfc_un.res.bytes; 1424 sr.bytecnt = c->mfc_un.res.bytes;
1424 sr.wrong_if = c->mfc_un.res.wrong_if; 1425 sr.wrong_if = c->mfc_un.res.wrong_if;
1425 read_unlock(&mrt_lock); 1426 rcu_read_unlock();
1426 1427
1427 if (copy_to_user(arg, &sr, sizeof(sr))) 1428 if (copy_to_user(arg, &sr, sizeof(sr)))
1428 return -EFAULT; 1429 return -EFAULT;
1429 return 0; 1430 return 0;
1430 } 1431 }
1432 rcu_read_unlock();
1433 return -EADDRNOTAVAIL;
1434 default:
1435 return -ENOIOCTLCMD;
1436 }
1437}
1438
1439#ifdef CONFIG_COMPAT
1440struct compat_sioc_sg_req {
1441 struct in_addr src;
1442 struct in_addr grp;
1443 compat_ulong_t pktcnt;
1444 compat_ulong_t bytecnt;
1445 compat_ulong_t wrong_if;
1446};
1447
1448struct compat_sioc_vif_req {
1449 vifi_t vifi; /* Which iface */
1450 compat_ulong_t icount;
1451 compat_ulong_t ocount;
1452 compat_ulong_t ibytes;
1453 compat_ulong_t obytes;
1454};
1455
1456int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
1457{
1458 struct compat_sioc_sg_req sr;
1459 struct compat_sioc_vif_req vr;
1460 struct vif_device *vif;
1461 struct mfc_cache *c;
1462 struct net *net = sock_net(sk);
1463 struct mr_table *mrt;
1464
1465 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1466 if (mrt == NULL)
1467 return -ENOENT;
1468
1469 switch (cmd) {
1470 case SIOCGETVIFCNT:
1471 if (copy_from_user(&vr, arg, sizeof(vr)))
1472 return -EFAULT;
1473 if (vr.vifi >= mrt->maxvif)
1474 return -EINVAL;
1475 read_lock(&mrt_lock);
1476 vif = &mrt->vif_table[vr.vifi];
1477 if (VIF_EXISTS(mrt, vr.vifi)) {
1478 vr.icount = vif->pkt_in;
1479 vr.ocount = vif->pkt_out;
1480 vr.ibytes = vif->bytes_in;
1481 vr.obytes = vif->bytes_out;
1482 read_unlock(&mrt_lock);
1483
1484 if (copy_to_user(arg, &vr, sizeof(vr)))
1485 return -EFAULT;
1486 return 0;
1487 }
1431 read_unlock(&mrt_lock); 1488 read_unlock(&mrt_lock);
1432 return -EADDRNOTAVAIL; 1489 return -EADDRNOTAVAIL;
1490 case SIOCGETSGCNT:
1491 if (copy_from_user(&sr, arg, sizeof(sr)))
1492 return -EFAULT;
1493
1494 rcu_read_lock();
1495 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1496 if (c) {
1497 sr.pktcnt = c->mfc_un.res.pkt;
1498 sr.bytecnt = c->mfc_un.res.bytes;
1499 sr.wrong_if = c->mfc_un.res.wrong_if;
1500 rcu_read_unlock();
1501
1502 if (copy_to_user(arg, &sr, sizeof(sr)))
1503 return -EFAULT;
1504 return 0;
1505 }
1506 rcu_read_unlock();
1507 return -EADDRNOTAVAIL;
1433 default: 1508 default:
1434 return -ENOIOCTLCMD; 1509 return -ENOIOCTLCMD;
1435 } 1510 }
1436} 1511}
1512#endif
1437 1513
1438 1514
1439static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) 1515static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -1465,7 +1541,7 @@ static struct notifier_block ip_mr_notifier = {
1465}; 1541};
1466 1542
1467/* 1543/*
1468 * Encapsulate a packet by attaching a valid IPIP header to it. 1544 * Encapsulate a packet by attaching a valid IPIP header to it.
1469 * This avoids tunnel drivers and other mess and gives us the speed so 1545 * This avoids tunnel drivers and other mess and gives us the speed so
1470 * important for multicast video. 1546 * important for multicast video.
1471 */ 1547 */
@@ -1473,14 +1549,14 @@ static struct notifier_block ip_mr_notifier = {
1473static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) 1549static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1474{ 1550{
1475 struct iphdr *iph; 1551 struct iphdr *iph;
1476 struct iphdr *old_iph = ip_hdr(skb); 1552 const struct iphdr *old_iph = ip_hdr(skb);
1477 1553
1478 skb_push(skb, sizeof(struct iphdr)); 1554 skb_push(skb, sizeof(struct iphdr));
1479 skb->transport_header = skb->network_header; 1555 skb->transport_header = skb->network_header;
1480 skb_reset_network_header(skb); 1556 skb_reset_network_header(skb);
1481 iph = ip_hdr(skb); 1557 iph = ip_hdr(skb);
1482 1558
1483 iph->version = 4; 1559 iph->version = 4;
1484 iph->tos = old_iph->tos; 1560 iph->tos = old_iph->tos;
1485 iph->ttl = old_iph->ttl; 1561 iph->ttl = old_iph->ttl;
1486 iph->frag_off = 0; 1562 iph->frag_off = 0;
@@ -1498,7 +1574,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1498 1574
1499static inline int ipmr_forward_finish(struct sk_buff *skb) 1575static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{ 1576{
1501 struct ip_options * opt = &(IPCB(skb)->opt); 1577 struct ip_options *opt = &(IPCB(skb)->opt);
1502 1578
1503 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1579 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 1580
@@ -1519,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1519 struct vif_device *vif = &mrt->vif_table[vifi]; 1595 struct vif_device *vif = &mrt->vif_table[vifi];
1520 struct net_device *dev; 1596 struct net_device *dev;
1521 struct rtable *rt; 1597 struct rtable *rt;
1598 struct flowi4 fl4;
1522 int encap = 0; 1599 int encap = 0;
1523 1600
1524 if (vif->dev == NULL) 1601 if (vif->dev == NULL)
@@ -1535,23 +1612,21 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535 } 1612 }
1536#endif 1613#endif
1537 1614
1538 if (vif->flags&VIFF_TUNNEL) { 1615 if (vif->flags & VIFF_TUNNEL) {
1539 struct flowi fl = { .oif = vif->link, 1616 rt = ip_route_output_ports(net, &fl4, NULL,
1540 .nl_u = { .ip4_u = 1617 vif->remote, vif->local,
1541 { .daddr = vif->remote, 1618 0, 0,
1542 .saddr = vif->local, 1619 IPPROTO_IPIP,
1543 .tos = RT_TOS(iph->tos) } }, 1620 RT_TOS(iph->tos), vif->link);
1544 .proto = IPPROTO_IPIP }; 1621 if (IS_ERR(rt))
1545 if (ip_route_output_key(net, &rt, &fl))
1546 goto out_free; 1622 goto out_free;
1547 encap = sizeof(struct iphdr); 1623 encap = sizeof(struct iphdr);
1548 } else { 1624 } else {
1549 struct flowi fl = { .oif = vif->link, 1625 rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
1550 .nl_u = { .ip4_u = 1626 0, 0,
1551 { .daddr = iph->daddr, 1627 IPPROTO_IPIP,
1552 .tos = RT_TOS(iph->tos) } }, 1628 RT_TOS(iph->tos), vif->link);
1553 .proto = IPPROTO_IPIP }; 1629 if (IS_ERR(rt))
1554 if (ip_route_output_key(net, &rt, &fl))
1555 goto out_free; 1630 goto out_free;
1556 } 1631 }
1557 1632
@@ -1559,8 +1634,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1559 1634
1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { 1635 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 /* Do not fragment multicasts. Alas, IPv4 does not 1636 /* Do not fragment multicasts. Alas, IPv4 does not
1562 allow to send ICMP, so that packets will disappear 1637 * allow to send ICMP, so that packets will disappear
1563 to blackhole. 1638 * to blackhole.
1564 */ 1639 */
1565 1640
1566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1641 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1658,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1583 ip_decrease_ttl(ip_hdr(skb)); 1658 ip_decrease_ttl(ip_hdr(skb));
1584 1659
1585 /* FIXME: forward and output firewalls used to be called here. 1660 /* FIXME: forward and output firewalls used to be called here.
1586 * What do we do with netfilter? -- RR */ 1661 * What do we do with netfilter? -- RR
1662 */
1587 if (vif->flags & VIFF_TUNNEL) { 1663 if (vif->flags & VIFF_TUNNEL) {
1588 ip_encap(skb, vif->local, vif->remote); 1664 ip_encap(skb, vif->local, vif->remote);
1589 /* FIXME: extra output firewall step used to be here. --RR */ 1665 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1642,17 +1718,17 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1642 if (mrt->vif_table[vif].dev != skb->dev) { 1718 if (mrt->vif_table[vif].dev != skb->dev) {
1643 int true_vifi; 1719 int true_vifi;
1644 1720
1645 if (skb_rtable(skb)->fl.iif == 0) { 1721 if (rt_is_output_route(skb_rtable(skb))) {
1646 /* It is our own packet, looped back. 1722 /* It is our own packet, looped back.
1647 Very complicated situation... 1723 * Very complicated situation...
1648 1724 *
1649 The best workaround until routing daemons will be 1725 * The best workaround until routing daemons will be
1650 fixed is not to redistribute packet, if it was 1726 * fixed is not to redistribute packet, if it was
1651 send through wrong interface. It means, that 1727 * send through wrong interface. It means, that
1652 multicast applications WILL NOT work for 1728 * multicast applications WILL NOT work for
1653 (S,G), which have default multicast route pointing 1729 * (S,G), which have default multicast route pointing
1654 to wrong oif. In any case, it is not a good 1730 * to wrong oif. In any case, it is not a good
1655 idea to use multicasting applications on router. 1731 * idea to use multicasting applications on router.
1656 */ 1732 */
1657 goto dont_forward; 1733 goto dont_forward;
1658 } 1734 }
@@ -1662,9 +1738,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1662 1738
1663 if (true_vifi >= 0 && mrt->mroute_do_assert && 1739 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 /* pimsm uses asserts, when switching from RPT to SPT, 1740 /* pimsm uses asserts, when switching from RPT to SPT,
1665 so that we cannot check that packet arrived on an oif. 1741 * so that we cannot check that packet arrived on an oif.
1666 It is bad, but otherwise we would need to move pretty 1742 * It is bad, but otherwise we would need to move pretty
1667 large chunk of pimd to kernel. Ough... --ANK 1743 * large chunk of pimd to kernel. Ough... --ANK
1668 */ 1744 */
1669 (mrt->mroute_do_pim || 1745 (mrt->mroute_do_pim ||
1670 cache->mfc_un.res.ttls[true_vifi] < 255) && 1746 cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1758,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1682 /* 1758 /*
1683 * Forward the frame 1759 * Forward the frame
1684 */ 1760 */
1685 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1761 for (ct = cache->mfc_un.res.maxvif - 1;
1762 ct >= cache->mfc_un.res.minvif; ct--) {
1686 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1763 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 if (psend != -1) { 1764 if (psend != -1) {
1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1765 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1766
1689 if (skb2) 1767 if (skb2)
1690 ipmr_queue_xmit(net, mrt, skb2, cache, 1768 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend); 1769 psend);
@@ -1696,6 +1774,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1696 if (psend != -1) { 1774 if (psend != -1) {
1697 if (local) { 1775 if (local) {
1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1776 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1777
1699 if (skb2) 1778 if (skb2)
1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend); 1779 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 } else { 1780 } else {
@@ -1710,9 +1789,30 @@ dont_forward:
1710 return 0; 1789 return 0;
1711} 1790}
1712 1791
1792static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1793{
1794 struct rtable *rt = skb_rtable(skb);
1795 struct iphdr *iph = ip_hdr(skb);
1796 struct flowi4 fl4 = {
1797 .daddr = iph->daddr,
1798 .saddr = iph->saddr,
1799 .flowi4_tos = iph->tos,
1800 .flowi4_oif = rt->rt_oif,
1801 .flowi4_iif = rt->rt_iif,
1802 .flowi4_mark = rt->rt_mark,
1803 };
1804 struct mr_table *mrt;
1805 int err;
1806
1807 err = ipmr_fib_lookup(net, &fl4, &mrt);
1808 if (err)
1809 return ERR_PTR(err);
1810 return mrt;
1811}
1713 1812
1714/* 1813/*
1715 * Multicast packets for forwarding arrive here 1814 * Multicast packets for forwarding arrive here
1815 * Called with rcu_read_lock();
1716 */ 1816 */
1717 1817
1718int ip_mr_input(struct sk_buff *skb) 1818int ip_mr_input(struct sk_buff *skb)
@@ -1721,43 +1821,41 @@ int ip_mr_input(struct sk_buff *skb)
1721 struct net *net = dev_net(skb->dev); 1821 struct net *net = dev_net(skb->dev);
1722 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1822 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723 struct mr_table *mrt; 1823 struct mr_table *mrt;
1724 int err;
1725 1824
1726 /* Packet is looped back after forward, it should not be 1825 /* Packet is looped back after forward, it should not be
1727 forwarded second time, but still can be delivered locally. 1826 * forwarded second time, but still can be delivered locally.
1728 */ 1827 */
1729 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1828 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730 goto dont_forward; 1829 goto dont_forward;
1731 1830
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1831 mrt = ipmr_rt_fib_lookup(net, skb);
1733 if (err < 0) { 1832 if (IS_ERR(mrt)) {
1734 kfree_skb(skb); 1833 kfree_skb(skb);
1735 return err; 1834 return PTR_ERR(mrt);
1736 } 1835 }
1737
1738 if (!local) { 1836 if (!local) {
1739 if (IPCB(skb)->opt.router_alert) { 1837 if (IPCB(skb)->opt.router_alert) {
1740 if (ip_call_ra_chain(skb)) 1838 if (ip_call_ra_chain(skb))
1741 return 0; 1839 return 0;
1742 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1840 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743 /* IGMPv1 (and broken IGMPv2 implementations sort of 1841 /* IGMPv1 (and broken IGMPv2 implementations sort of
1744 Cisco IOS <= 11.2(8)) do not put router alert 1842 * Cisco IOS <= 11.2(8)) do not put router alert
1745 option to IGMP packets destined to routable 1843 * option to IGMP packets destined to routable
1746 groups. It is very bad, because it means 1844 * groups. It is very bad, because it means
1747 that we can forward NO IGMP messages. 1845 * that we can forward NO IGMP messages.
1748 */ 1846 */
1749 read_lock(&mrt_lock); 1847 struct sock *mroute_sk;
1750 if (mrt->mroute_sk) { 1848
1751 nf_reset(skb); 1849 mroute_sk = rcu_dereference(mrt->mroute_sk);
1752 raw_rcv(mrt->mroute_sk, skb); 1850 if (mroute_sk) {
1753 read_unlock(&mrt_lock); 1851 nf_reset(skb);
1754 return 0; 1852 raw_rcv(mroute_sk, skb);
1755 } 1853 return 0;
1756 read_unlock(&mrt_lock); 1854 }
1757 } 1855 }
1758 } 1856 }
1759 1857
1760 read_lock(&mrt_lock); 1858 /* already under rcu_read_lock() */
1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1859 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 1860
1763 /* 1861 /*
@@ -1769,13 +1867,12 @@ int ip_mr_input(struct sk_buff *skb)
1769 if (local) { 1867 if (local) {
1770 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1868 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 ip_local_deliver(skb); 1869 ip_local_deliver(skb);
1772 if (skb2 == NULL) { 1870 if (skb2 == NULL)
1773 read_unlock(&mrt_lock);
1774 return -ENOBUFS; 1871 return -ENOBUFS;
1775 }
1776 skb = skb2; 1872 skb = skb2;
1777 } 1873 }
1778 1874
1875 read_lock(&mrt_lock);
1779 vif = ipmr_find_vif(mrt, skb->dev); 1876 vif = ipmr_find_vif(mrt, skb->dev);
1780 if (vif >= 0) { 1877 if (vif >= 0) {
1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb); 1878 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1885,8 @@ int ip_mr_input(struct sk_buff *skb)
1788 return -ENODEV; 1885 return -ENODEV;
1789 } 1886 }
1790 1887
1888 read_lock(&mrt_lock);
1791 ip_mr_forward(net, mrt, skb, cache, local); 1889 ip_mr_forward(net, mrt, skb, cache, local);
1792
1793 read_unlock(&mrt_lock); 1890 read_unlock(&mrt_lock);
1794 1891
1795 if (local) 1892 if (local)
@@ -1805,6 +1902,7 @@ dont_forward:
1805} 1902}
1806 1903
1807#ifdef CONFIG_IP_PIMSM 1904#ifdef CONFIG_IP_PIMSM
1905/* called with rcu_read_lock() */
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, 1906static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen) 1907 unsigned int pimlen)
1810{ 1908{
@@ -1813,10 +1911,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1813 1911
1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1912 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 /* 1913 /*
1816 Check that: 1914 * Check that:
1817 a. packet is really destinted to a multicast group 1915 * a. packet is really sent to a multicast group
1818 b. packet is not a NULL-REGISTER 1916 * b. packet is not a NULL-REGISTER
1819 c. packet is not truncated 1917 * c. packet is not truncated
1820 */ 1918 */
1821 if (!ipv4_is_multicast(encap->daddr) || 1919 if (!ipv4_is_multicast(encap->daddr) ||
1822 encap->tot_len == 0 || 1920 encap->tot_len == 0 ||
@@ -1826,26 +1924,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1826 read_lock(&mrt_lock); 1924 read_lock(&mrt_lock);
1827 if (mrt->mroute_reg_vif_num >= 0) 1925 if (mrt->mroute_reg_vif_num >= 0)
1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 1926 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 if (reg_dev)
1830 dev_hold(reg_dev);
1831 read_unlock(&mrt_lock); 1927 read_unlock(&mrt_lock);
1832 1928
1833 if (reg_dev == NULL) 1929 if (reg_dev == NULL)
1834 return 1; 1930 return 1;
1835 1931
1836 skb->mac_header = skb->network_header; 1932 skb->mac_header = skb->network_header;
1837 skb_pull(skb, (u8*)encap - skb->data); 1933 skb_pull(skb, (u8 *)encap - skb->data);
1838 skb_reset_network_header(skb); 1934 skb_reset_network_header(skb);
1839 skb->protocol = htons(ETH_P_IP); 1935 skb->protocol = htons(ETH_P_IP);
1840 skb->ip_summed = 0; 1936 skb->ip_summed = CHECKSUM_NONE;
1841 skb->pkt_type = PACKET_HOST; 1937 skb->pkt_type = PACKET_HOST;
1842 1938
1843 skb_tunnel_rx(skb, reg_dev); 1939 skb_tunnel_rx(skb, reg_dev);
1844 1940
1845 netif_rx(skb); 1941 netif_rx(skb);
1846 dev_put(reg_dev);
1847 1942
1848 return 0; 1943 return NET_RX_SUCCESS;
1849} 1944}
1850#endif 1945#endif
1851 1946
@@ -1854,7 +1949,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1854 * Handle IGMP messages of PIMv1 1949 * Handle IGMP messages of PIMv1
1855 */ 1950 */
1856 1951
1857int pim_rcv_v1(struct sk_buff * skb) 1952int pim_rcv_v1(struct sk_buff *skb)
1858{ 1953{
1859 struct igmphdr *pim; 1954 struct igmphdr *pim;
1860 struct net *net = dev_net(skb->dev); 1955 struct net *net = dev_net(skb->dev);
@@ -1865,9 +1960,9 @@ int pim_rcv_v1(struct sk_buff * skb)
1865 1960
1866 pim = igmp_hdr(skb); 1961 pim = igmp_hdr(skb);
1867 1962
1868 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1963 mrt = ipmr_rt_fib_lookup(net, skb);
1964 if (IS_ERR(mrt))
1869 goto drop; 1965 goto drop;
1870
1871 if (!mrt->mroute_do_pim || 1966 if (!mrt->mroute_do_pim ||
1872 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1967 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1873 goto drop; 1968 goto drop;
@@ -1881,7 +1976,7 @@ drop:
1881#endif 1976#endif
1882 1977
1883#ifdef CONFIG_IP_PIMSM_V2 1978#ifdef CONFIG_IP_PIMSM_V2
1884static int pim_rcv(struct sk_buff * skb) 1979static int pim_rcv(struct sk_buff *skb)
1885{ 1980{
1886 struct pimreghdr *pim; 1981 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev); 1982 struct net *net = dev_net(skb->dev);
@@ -1891,15 +1986,15 @@ static int pim_rcv(struct sk_buff * skb)
1891 goto drop; 1986 goto drop;
1892 1987
1893 pim = (struct pimreghdr *)skb_transport_header(skb); 1988 pim = (struct pimreghdr *)skb_transport_header(skb);
1894 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1989 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1895 (pim->flags&PIM_NULL_REGISTER) || 1990 (pim->flags & PIM_NULL_REGISTER) ||
1896 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1991 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1992 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 goto drop; 1993 goto drop;
1899 1994
1900 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1995 mrt = ipmr_rt_fib_lookup(net, skb);
1996 if (IS_ERR(mrt))
1901 goto drop; 1997 goto drop;
1902
1903 if (__pim_rcv(mrt, skb, sizeof(*pim))) { 1998 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1904drop: 1999drop:
1905 kfree_skb(skb); 2000 kfree_skb(skb);
@@ -1946,40 +2041,45 @@ rtattr_failure:
1946 return -EMSGSIZE; 2041 return -EMSGSIZE;
1947} 2042}
1948 2043
1949int ipmr_get_route(struct net *net, 2044int ipmr_get_route(struct net *net, struct sk_buff *skb,
1950 struct sk_buff *skb, struct rtmsg *rtm, int nowait) 2045 __be32 saddr, __be32 daddr,
2046 struct rtmsg *rtm, int nowait)
1951{ 2047{
1952 int err;
1953 struct mr_table *mrt;
1954 struct mfc_cache *cache; 2048 struct mfc_cache *cache;
1955 struct rtable *rt = skb_rtable(skb); 2049 struct mr_table *mrt;
2050 int err;
1956 2051
1957 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); 2052 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1958 if (mrt == NULL) 2053 if (mrt == NULL)
1959 return -ENOENT; 2054 return -ENOENT;
1960 2055
1961 read_lock(&mrt_lock); 2056 rcu_read_lock();
1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); 2057 cache = ipmr_cache_find(mrt, saddr, daddr);
1963 2058
1964 if (cache == NULL) { 2059 if (cache == NULL) {
1965 struct sk_buff *skb2; 2060 struct sk_buff *skb2;
1966 struct iphdr *iph; 2061 struct iphdr *iph;
1967 struct net_device *dev; 2062 struct net_device *dev;
1968 int vif; 2063 int vif = -1;
1969 2064
1970 if (nowait) { 2065 if (nowait) {
1971 read_unlock(&mrt_lock); 2066 rcu_read_unlock();
1972 return -EAGAIN; 2067 return -EAGAIN;
1973 } 2068 }
1974 2069
1975 dev = skb->dev; 2070 dev = skb->dev;
1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { 2071 read_lock(&mrt_lock);
2072 if (dev)
2073 vif = ipmr_find_vif(mrt, dev);
2074 if (vif < 0) {
1977 read_unlock(&mrt_lock); 2075 read_unlock(&mrt_lock);
2076 rcu_read_unlock();
1978 return -ENODEV; 2077 return -ENODEV;
1979 } 2078 }
1980 skb2 = skb_clone(skb, GFP_ATOMIC); 2079 skb2 = skb_clone(skb, GFP_ATOMIC);
1981 if (!skb2) { 2080 if (!skb2) {
1982 read_unlock(&mrt_lock); 2081 read_unlock(&mrt_lock);
2082 rcu_read_unlock();
1983 return -ENOMEM; 2083 return -ENOMEM;
1984 } 2084 }
1985 2085
@@ -1987,18 +2087,21 @@ int ipmr_get_route(struct net *net,
1987 skb_reset_network_header(skb2); 2087 skb_reset_network_header(skb2);
1988 iph = ip_hdr(skb2); 2088 iph = ip_hdr(skb2);
1989 iph->ihl = sizeof(struct iphdr) >> 2; 2089 iph->ihl = sizeof(struct iphdr) >> 2;
1990 iph->saddr = rt->rt_src; 2090 iph->saddr = saddr;
1991 iph->daddr = rt->rt_dst; 2091 iph->daddr = daddr;
1992 iph->version = 0; 2092 iph->version = 0;
1993 err = ipmr_cache_unresolved(mrt, vif, skb2); 2093 err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 read_unlock(&mrt_lock); 2094 read_unlock(&mrt_lock);
2095 rcu_read_unlock();
1995 return err; 2096 return err;
1996 } 2097 }
1997 2098
1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2099 read_lock(&mrt_lock);
2100 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1999 cache->mfc_flags |= MFC_NOTIFY; 2101 cache->mfc_flags |= MFC_NOTIFY;
2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm); 2102 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 read_unlock(&mrt_lock); 2103 read_unlock(&mrt_lock);
2104 rcu_read_unlock();
2002 return err; 2105 return err;
2003} 2106}
2004 2107
@@ -2050,14 +2153,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2050 s_h = cb->args[1]; 2153 s_h = cb->args[1];
2051 s_e = cb->args[2]; 2154 s_e = cb->args[2];
2052 2155
2053 read_lock(&mrt_lock); 2156 rcu_read_lock();
2054 ipmr_for_each_table(mrt, net) { 2157 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t) 2158 if (t < s_t)
2056 goto next_table; 2159 goto next_table;
2057 if (t > s_t) 2160 if (t > s_t)
2058 s_h = 0; 2161 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) { 2162 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { 2163 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e) 2164 if (e < s_e)
2062 goto next_entry; 2165 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb, 2166 if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2178,7 @@ next_table:
2075 t++; 2178 t++;
2076 } 2179 }
2077done: 2180done:
2078 read_unlock(&mrt_lock); 2181 rcu_read_unlock();
2079 2182
2080 cb->args[2] = e; 2183 cb->args[2] = e;
2081 cb->args[1] = h; 2184 cb->args[1] = h;
@@ -2086,7 +2189,8 @@ done:
2086 2189
2087#ifdef CONFIG_PROC_FS 2190#ifdef CONFIG_PROC_FS
2088/* 2191/*
2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2192 * The /proc interfaces to multicast routing :
2193 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2090 */ 2194 */
2091struct ipmr_vif_iter { 2195struct ipmr_vif_iter {
2092 struct seq_net_private p; 2196 struct seq_net_private p;
@@ -2208,14 +2312,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2208 struct mr_table *mrt = it->mrt; 2312 struct mr_table *mrt = it->mrt;
2209 struct mfc_cache *mfc; 2313 struct mfc_cache *mfc;
2210 2314
2211 read_lock(&mrt_lock); 2315 rcu_read_lock();
2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2316 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 it->cache = &mrt->mfc_cache_array[it->ct]; 2317 it->cache = &mrt->mfc_cache_array[it->ct];
2214 list_for_each_entry(mfc, it->cache, list) 2318 list_for_each_entry_rcu(mfc, it->cache, list)
2215 if (pos-- == 0) 2319 if (pos-- == 0)
2216 return mfc; 2320 return mfc;
2217 } 2321 }
2218 read_unlock(&mrt_lock); 2322 rcu_read_unlock();
2219 2323
2220 spin_lock_bh(&mfc_unres_lock); 2324 spin_lock_bh(&mfc_unres_lock);
2221 it->cache = &mrt->mfc_unres_queue; 2325 it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2378,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 } 2378 }
2275 2379
2276 /* exhausted cache_array, show unresolved */ 2380 /* exhausted cache_array, show unresolved */
2277 read_unlock(&mrt_lock); 2381 rcu_read_unlock();
2278 it->cache = &mrt->mfc_unres_queue; 2382 it->cache = &mrt->mfc_unres_queue;
2279 it->ct = 0; 2383 it->ct = 0;
2280 2384
@@ -2282,7 +2386,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 if (!list_empty(it->cache)) 2386 if (!list_empty(it->cache))
2283 return list_first_entry(it->cache, struct mfc_cache, list); 2387 return list_first_entry(it->cache, struct mfc_cache, list);
2284 2388
2285 end_of_list: 2389end_of_list:
2286 spin_unlock_bh(&mfc_unres_lock); 2390 spin_unlock_bh(&mfc_unres_lock);
2287 it->cache = NULL; 2391 it->cache = NULL;
2288 2392
@@ -2297,7 +2401,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2297 if (it->cache == &mrt->mfc_unres_queue) 2401 if (it->cache == &mrt->mfc_unres_queue)
2298 spin_unlock_bh(&mfc_unres_lock); 2402 spin_unlock_bh(&mfc_unres_lock);
2299 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2403 else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 read_unlock(&mrt_lock); 2404 rcu_read_unlock();
2301} 2405}
2302 2406
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2407static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2427,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2323 mfc->mfc_un.res.bytes, 2427 mfc->mfc_un.res.bytes,
2324 mfc->mfc_un.res.wrong_if); 2428 mfc->mfc_un.res.wrong_if);
2325 for (n = mfc->mfc_un.res.minvif; 2429 for (n = mfc->mfc_un.res.minvif;
2326 n < mfc->mfc_un.res.maxvif; n++ ) { 2430 n < mfc->mfc_un.res.maxvif; n++) {
2327 if (VIF_EXISTS(mrt, n) && 2431 if (VIF_EXISTS(mrt, n) &&
2328 mfc->mfc_un.res.ttls[n] < 255) 2432 mfc->mfc_un.res.ttls[n] < 255)
2329 seq_printf(seq, 2433 seq_printf(seq,
@@ -2421,7 +2525,7 @@ int __init ip_mr_init(void)
2421 2525
2422 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2526 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 sizeof(struct mfc_cache), 2527 sizeof(struct mfc_cache),
2424 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2528 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2425 NULL); 2529 NULL);
2426 if (!mrt_cachep) 2530 if (!mrt_cachep)
2427 return -ENOMEM; 2531 return -ENOMEM;
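[Editorial note] The ipmr.c hunks above convert the MFC cache readers (ip_mr_input(), ipmr_get_route(), the /proc iterators, the rtnetlink dump) from read_lock(&mrt_lock) to rcu_read_lock() with list_for_each_entry_rcu(). Below is a minimal sketch of that read-side pattern, assuming kernel context; the demo_* names are illustrative stand-ins, not the actual ipmr helpers.

    #include <linux/rculist.h>
    #include <linux/types.h>

    struct demo_entry {
            struct list_head list;
            u32 key;
    };

    /* Lookup runs inside rcu_read_lock()/rcu_read_unlock(); writers publish
     * entries with list_add_rcu() and defer freeing past a grace period. */
    static struct demo_entry *demo_find(struct list_head *head, u32 key)
    {
            struct demo_entry *e;

            list_for_each_entry_rcu(e, head, list)
                    if (e->key == key)
                            return e;       /* valid only within the RCU read section */
            return NULL;
    }

In the diff, the caller supplies that bracketing: ip_mr_input() is documented as "Called with rcu_read_lock()", and the seq/file and dump paths above now take rcu_read_lock() themselves where they used to take mrt_lock for reading.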
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index d88a46c54fd1..2e97e3ec1eb7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,60 +16,47 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi4 fl4 = {};
20 unsigned long orefdst; 20 __be32 saddr = iph->saddr;
21 __u8 flags = 0;
21 unsigned int hh_len; 22 unsigned int hh_len;
22 unsigned int type;
23 23
24 type = inet_addr_type(net, iph->saddr); 24 if (!skb->sk && addr_type != RTN_LOCAL) {
25 if (skb->sk && inet_sk(skb->sk)->transparent) 25 if (addr_type == RTN_UNSPEC)
26 type = RTN_LOCAL; 26 addr_type = inet_addr_type(net, saddr);
27 if (addr_type == RTN_UNSPEC) 27 if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
28 addr_type = type; 28 flags |= FLOWI_FLAG_ANYSRC;
29 else
30 saddr = 0;
31 }
29 32
30 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause 33 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 34 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 35 */
33 if (addr_type == RTN_LOCAL) { 36 fl4.daddr = iph->daddr;
34 fl.nl_u.ip4_u.daddr = iph->daddr; 37 fl4.saddr = saddr;
35 if (type == RTN_LOCAL) 38 fl4.flowi4_tos = RT_TOS(iph->tos);
36 fl.nl_u.ip4_u.saddr = iph->saddr; 39 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
37 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 40 fl4.flowi4_mark = skb->mark;
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 41 fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags;
39 fl.mark = skb->mark; 42 rt = ip_route_output_key(net, &fl4);
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 43 if (IS_ERR(rt))
41 if (ip_route_output_key(net, &rt, &fl) != 0) 44 return -1;
42 return -1;
43
44 /* Drop old route. */
45 skb_dst_drop(skb);
46 skb_dst_set(skb, &rt->dst);
47 } else {
48 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */
50 fl.nl_u.ip4_u.daddr = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0)
52 return -1;
53 45
54 orefdst = skb->_skb_refdst; 46 /* Drop old route. */
55 if (ip_route_input(skb, iph->daddr, iph->saddr, 47 skb_dst_drop(skb);
56 RT_TOS(iph->tos), rt->dst.dev) != 0) { 48 skb_dst_set(skb, &rt->dst);
57 dst_release(&rt->dst);
58 return -1;
59 }
60 dst_release(&rt->dst);
61 refdst_drop(orefdst);
62 }
63 49
64 if (skb_dst(skb)->error) 50 if (skb_dst(skb)->error)
65 return -1; 51 return -1;
66 52
67#ifdef CONFIG_XFRM 53#ifdef CONFIG_XFRM
68 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 54 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
69 xfrm_decode_session(skb, &fl, AF_INET) == 0) { 55 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
70 struct dst_entry *dst = skb_dst(skb); 56 struct dst_entry *dst = skb_dst(skb);
71 skb_dst_set(skb, NULL); 57 skb_dst_set(skb, NULL);
72 if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) 58 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
59 if (IS_ERR(dst))
73 return -1; 60 return -1;
74 skb_dst_set(skb, dst); 61 skb_dst_set(skb, dst);
75 } 62 }
@@ -102,7 +89,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
102 dst = ((struct xfrm_dst *)dst)->route; 89 dst = ((struct xfrm_dst *)dst)->route;
103 dst_hold(dst); 90 dst_hold(dst);
104 91
105 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) 92 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
93 if (IS_ERR(dst))
106 return -1; 94 return -1;
107 95
108 skb_dst_drop(skb); 96 skb_dst_drop(skb);
@@ -217,9 +205,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
217 return csum; 205 return csum;
218} 206}
219 207
220static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) 208static int nf_ip_route(struct net *net, struct dst_entry **dst,
209 struct flowi *fl, bool strict __always_unused)
221{ 210{
222 return ip_route_output_key(&init_net, (struct rtable **)dst, fl); 211 struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
212 if (IS_ERR(rt))
213 return PTR_ERR(rt);
214 *dst = &rt->dst;
215 return 0;
223} 216}
224 217
225static const struct nf_afinfo nf_ip_afinfo = { 218static const struct nf_afinfo nf_ip_afinfo = {
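[Editorial note] The ip_route_me_harder() and nf_ip_route() rewrites above reflect the routing API change this merge brings in: ip_route_output_key() now takes a struct flowi4 and returns a struct rtable * (ERR_PTR-encoded on failure) instead of filling in a pointer and returning an int. A minimal sketch of the new call pattern, assuming kernel context; demo_route() itself is hypothetical.

    #include <net/route.h>
    #include <linux/err.h>

    static int demo_route(struct net *net, __be32 daddr, __be32 saddr, u8 tos)
    {
            struct flowi4 fl4 = {
                    .daddr       = daddr,
                    .saddr       = saddr,
                    .flowi4_tos  = RT_TOS(tos),
            };
            struct rtable *rt;

            rt = ip_route_output_key(net, &fl4);    /* ERR_PTR() on failure */
            if (IS_ERR(rt))
                    return PTR_ERR(rt);

            /* ... attach rt->dst to an skb, or otherwise use the route ... */
            ip_rt_put(rt);
            return 0;
    }

The same IS_ERR()/PTR_ERR() handling shows up in the ipmr.c and xfrm_lookup() hunks above, which is why the separate "rt != NULL" and integer-return checks disappear from the diff.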
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..1dfc18a03fd4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES
64if IP_NF_IPTABLES 64if IP_NF_IPTABLES
65 65
66# The matches. 66# The matches.
67config IP_NF_MATCH_ADDRTYPE
68 tristate '"addrtype" address type match support'
69 depends on NETFILTER_ADVANCED
70 help
71 This option allows you to match what routing thinks of an address,
72 eg. UNICAST, LOCAL, BROADCAST, ...
73
74 If you want to compile it as a module, say M here and read
75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH 67config IP_NF_MATCH_AH
78 tristate '"ah" match support' 68 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
@@ -147,7 +137,7 @@ config IP_NF_TARGET_ULOG
147 which can only be viewed through syslog. 137 which can only be viewed through syslog.
148 138
149 The appropriate userspace logging daemon (ulogd) may be obtained from 139 The appropriate userspace logging daemon (ulogd) may be obtained from
150 <http://www.gnumonks.org/projects/ulogd/> 140 <http://www.netfilter.org/projects/ulogd/index.html>
151 141
152 To compile it as a module, choose M here. If unsure, say N. 142 To compile it as a module, choose M here. If unsure, say N.
153 143
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT
206 196
207config NF_NAT_SNMP_BASIC 197config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 198 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 199 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
201 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 202 ---help---
212 203
213 This module implements an Application Layer Gateway (ALG) for 204 This module implements an Application Layer Gateway (ALG) for
@@ -324,10 +315,10 @@ config IP_NF_TARGET_ECN
324 315
325config IP_NF_TARGET_TTL 316config IP_NF_TARGET_TTL
326 tristate '"TTL" target support' 317 tristate '"TTL" target support'
327 depends on NETFILTER_ADVANCED 318 depends on NETFILTER_ADVANCED && IP_NF_MANGLE
328 select NETFILTER_XT_TARGET_HL 319 select NETFILTER_XT_TARGET_HL
329 ---help--- 320 ---help---
330 This is a backwards-compat option for the user's convenience 321 This is a backwards-compatible option for the user's convenience
331 (e.g. when running oldconfig). It selects 322 (e.g. when running oldconfig). It selects
332 CONFIG_NETFILTER_XT_TARGET_HL. 323 CONFIG_NETFILTER_XT_TARGET_HL.
333 324
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 48111594ee9b..dca2082ec683 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,15 +3,15 @@
3# 3#
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y) 8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o 9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o 13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15 15
16# connection tracking 16# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54 53
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..fd7a3f68917f 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,11 +72,11 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
72 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
74 74
75 return (ret != 0); 75 return ret != 0;
76} 76}
77 77
78/* 78/*
79 * Unfortunatly, _b and _mask are not aligned to an int (or long int) 79 * Unfortunately, _b and _mask are not aligned to an int (or long int)
80 * Some arches dont care, unrolling the loop is a win on them. 80 * Some arches dont care, unrolling the loop is a win on them.
81 * For other arches, we only have a 16bit alignement. 81 * For other arches, we only have a 16bit alignement.
82 */ 82 */
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
228 return NF_DROP; 228 return NF_DROP;
229} 229}
230 230
231static inline const struct arpt_entry_target * 231static inline const struct xt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e) 232arpt_get_target_c(const struct arpt_entry *e)
233{ 233{
234 return arpt_get_target((struct arpt_entry *)e); 234 return arpt_get_target((struct arpt_entry *)e);
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
260 void *table_base; 260 void *table_base;
261 const struct xt_table_info *private; 261 const struct xt_table_info *private;
262 struct xt_action_param acpar; 262 struct xt_action_param acpar;
263 unsigned int addend;
263 264
264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 265 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
265 return NF_DROP; 266 return NF_DROP;
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
267 indev = in ? in->name : nulldevname; 268 indev = in ? in->name : nulldevname;
268 outdev = out ? out->name : nulldevname; 269 outdev = out ? out->name : nulldevname;
269 270
270 xt_info_rdlock_bh(); 271 local_bh_disable();
272 addend = xt_write_recseq_begin();
271 private = table->private; 273 private = table->private;
272 table_base = private->entries[smp_processor_id()]; 274 table_base = private->entries[smp_processor_id()];
273 275
@@ -282,7 +284,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
282 284
283 arp = arp_hdr(skb); 285 arp = arp_hdr(skb);
284 do { 286 do {
285 const struct arpt_entry_target *t; 287 const struct xt_entry_target *t;
286 288
287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 289 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
288 e = arpt_next_entry(e); 290 e = arpt_next_entry(e);
@@ -297,10 +299,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
297 if (!t->u.kernel.target->target) { 299 if (!t->u.kernel.target->target) {
298 int v; 300 int v;
299 301
300 v = ((struct arpt_standard_target *)t)->verdict; 302 v = ((struct xt_standard_target *)t)->verdict;
301 if (v < 0) { 303 if (v < 0) {
302 /* Pop from stack? */ 304 /* Pop from stack? */
303 if (v != ARPT_RETURN) { 305 if (v != XT_RETURN) {
304 verdict = (unsigned)(-v) - 1; 306 verdict = (unsigned)(-v) - 1;
305 break; 307 break;
306 } 308 }
@@ -332,13 +334,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
332 /* Target might have changed stuff. */ 334 /* Target might have changed stuff. */
333 arp = arp_hdr(skb); 335 arp = arp_hdr(skb);
334 336
335 if (verdict == ARPT_CONTINUE) 337 if (verdict == XT_CONTINUE)
336 e = arpt_next_entry(e); 338 e = arpt_next_entry(e);
337 else 339 else
338 /* Verdict */ 340 /* Verdict */
339 break; 341 break;
340 } while (!acpar.hotdrop); 342 } while (!acpar.hotdrop);
341 xt_info_rdunlock_bh(); 343 xt_write_recseq_end(addend);
344 local_bh_enable();
342 345
343 if (acpar.hotdrop) 346 if (acpar.hotdrop)
344 return NF_DROP; 347 return NF_DROP;
@@ -377,7 +380,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
377 e->counters.pcnt = pos; 380 e->counters.pcnt = pos;
378 381
379 for (;;) { 382 for (;;) {
380 const struct arpt_standard_target *t 383 const struct xt_standard_target *t
381 = (void *)arpt_get_target_c(e); 384 = (void *)arpt_get_target_c(e);
382 int visited = e->comefrom & (1 << hook); 385 int visited = e->comefrom & (1 << hook);
383 386
@@ -392,13 +395,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
392 /* Unconditional return/END. */ 395 /* Unconditional return/END. */
393 if ((e->target_offset == sizeof(struct arpt_entry) && 396 if ((e->target_offset == sizeof(struct arpt_entry) &&
394 (strcmp(t->target.u.user.name, 397 (strcmp(t->target.u.user.name,
395 ARPT_STANDARD_TARGET) == 0) && 398 XT_STANDARD_TARGET) == 0) &&
396 t->verdict < 0 && unconditional(&e->arp)) || 399 t->verdict < 0 && unconditional(&e->arp)) ||
397 visited) { 400 visited) {
398 unsigned int oldpos, size; 401 unsigned int oldpos, size;
399 402
400 if ((strcmp(t->target.u.user.name, 403 if ((strcmp(t->target.u.user.name,
401 ARPT_STANDARD_TARGET) == 0) && 404 XT_STANDARD_TARGET) == 0) &&
402 t->verdict < -NF_MAX_VERDICT - 1) { 405 t->verdict < -NF_MAX_VERDICT - 1) {
403 duprintf("mark_source_chains: bad " 406 duprintf("mark_source_chains: bad "
404 "negative verdict (%i)\n", 407 "negative verdict (%i)\n",
@@ -433,7 +436,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
433 int newpos = t->verdict; 436 int newpos = t->verdict;
434 437
435 if (strcmp(t->target.u.user.name, 438 if (strcmp(t->target.u.user.name,
436 ARPT_STANDARD_TARGET) == 0 && 439 XT_STANDARD_TARGET) == 0 &&
437 newpos >= 0) { 440 newpos >= 0) {
438 if (newpos > newinfo->size - 441 if (newpos > newinfo->size -
439 sizeof(struct arpt_entry)) { 442 sizeof(struct arpt_entry)) {
@@ -464,14 +467,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
464 467
465static inline int check_entry(const struct arpt_entry *e, const char *name) 468static inline int check_entry(const struct arpt_entry *e, const char *name)
466{ 469{
467 const struct arpt_entry_target *t; 470 const struct xt_entry_target *t;
468 471
469 if (!arp_checkentry(&e->arp)) { 472 if (!arp_checkentry(&e->arp)) {
470 duprintf("arp_tables: arp check failed %p %s.\n", e, name); 473 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
471 return -EINVAL; 474 return -EINVAL;
472 } 475 }
473 476
474 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 477 if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
475 return -EINVAL; 478 return -EINVAL;
476 479
477 t = arpt_get_target_c(e); 480 t = arpt_get_target_c(e);
@@ -483,7 +486,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
483 486
484static inline int check_target(struct arpt_entry *e, const char *name) 487static inline int check_target(struct arpt_entry *e, const char *name)
485{ 488{
486 struct arpt_entry_target *t = arpt_get_target(e); 489 struct xt_entry_target *t = arpt_get_target(e);
487 int ret; 490 int ret;
488 struct xt_tgchk_param par = { 491 struct xt_tgchk_param par = {
489 .table = name, 492 .table = name,
@@ -506,7 +509,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
506static inline int 509static inline int
507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) 510find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
508{ 511{
509 struct arpt_entry_target *t; 512 struct xt_entry_target *t;
510 struct xt_target *target; 513 struct xt_target *target;
511 int ret; 514 int ret;
512 515
@@ -536,7 +539,7 @@ out:
536 539
537static bool check_underflow(const struct arpt_entry *e) 540static bool check_underflow(const struct arpt_entry *e)
538{ 541{
539 const struct arpt_entry_target *t; 542 const struct xt_entry_target *t;
540 unsigned int verdict; 543 unsigned int verdict;
541 544
542 if (!unconditional(&e->arp)) 545 if (!unconditional(&e->arp))
@@ -544,7 +547,7 @@ static bool check_underflow(const struct arpt_entry *e)
544 t = arpt_get_target_c(e); 547 t = arpt_get_target_c(e);
545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 548 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
546 return false; 549 return false;
547 verdict = ((struct arpt_standard_target *)t)->verdict; 550 verdict = ((struct xt_standard_target *)t)->verdict;
548 verdict = -verdict - 1; 551 verdict = -verdict - 1;
549 return verdict == NF_DROP || verdict == NF_ACCEPT; 552 return verdict == NF_DROP || verdict == NF_ACCEPT;
550} 553}
@@ -566,7 +569,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
566 } 569 }
567 570
568 if (e->next_offset 571 if (e->next_offset
569 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { 572 < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
570 duprintf("checking: element %p size %u\n", 573 duprintf("checking: element %p size %u\n",
571 e, e->next_offset); 574 e, e->next_offset);
572 return -EINVAL; 575 return -EINVAL;
@@ -598,7 +601,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
598static inline void cleanup_entry(struct arpt_entry *e) 601static inline void cleanup_entry(struct arpt_entry *e)
599{ 602{
600 struct xt_tgdtor_param par; 603 struct xt_tgdtor_param par;
601 struct arpt_entry_target *t; 604 struct xt_entry_target *t;
602 605
603 t = arpt_get_target(e); 606 t = arpt_get_target(e);
604 par.target = t->u.kernel.target; 607 par.target = t->u.kernel.target;
@@ -710,42 +713,25 @@ static void get_counters(const struct xt_table_info *t,
710 struct arpt_entry *iter; 713 struct arpt_entry *iter;
711 unsigned int cpu; 714 unsigned int cpu;
712 unsigned int i; 715 unsigned int i;
713 unsigned int curcpu = get_cpu();
714
715 /* Instead of clearing (by a previous call to memset())
716 * the counters and using adds, we set the counters
717 * with data used by 'current' CPU
718 *
719 * Bottom half has to be disabled to prevent deadlock
720 * if new softirq were to run and call ipt_do_table
721 */
722 local_bh_disable();
723 i = 0;
724 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
725 SET_COUNTER(counters[i], iter->counters.bcnt,
726 iter->counters.pcnt);
727 ++i;
728 }
729 local_bh_enable();
730 /* Processing counters from other cpus, we can let bottom half enabled,
731 * (preemption is disabled)
732 */
733 716
734 for_each_possible_cpu(cpu) { 717 for_each_possible_cpu(cpu) {
735 if (cpu == curcpu) 718 seqcount_t *s = &per_cpu(xt_recseq, cpu);
736 continue; 719
737 i = 0; 720 i = 0;
738 local_bh_disable();
739 xt_info_wrlock(cpu);
740 xt_entry_foreach(iter, t->entries[cpu], t->size) { 721 xt_entry_foreach(iter, t->entries[cpu], t->size) {
741 ADD_COUNTER(counters[i], iter->counters.bcnt, 722 u64 bcnt, pcnt;
742 iter->counters.pcnt); 723 unsigned int start;
724
725 do {
726 start = read_seqcount_begin(s);
727 bcnt = iter->counters.bcnt;
728 pcnt = iter->counters.pcnt;
729 } while (read_seqcount_retry(s, start));
730
731 ADD_COUNTER(counters[i], bcnt, pcnt);
743 ++i; 732 ++i;
744 } 733 }
745 xt_info_wrunlock(cpu);
746 local_bh_enable();
747 } 734 }
748 put_cpu();
749} 735}
750 736
751static struct xt_counters *alloc_counters(const struct xt_table *table) 737static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +745,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
759 * about). 745 * about).
760 */ 746 */
761 countersize = sizeof(struct xt_counters) * private->number; 747 countersize = sizeof(struct xt_counters) * private->number;
762 counters = vmalloc(countersize); 748 counters = vzalloc(countersize);
763 749
764 if (counters == NULL) 750 if (counters == NULL)
765 return ERR_PTR(-ENOMEM); 751 return ERR_PTR(-ENOMEM);
@@ -794,7 +780,7 @@ static int copy_entries_to_user(unsigned int total_size,
794 /* FIXME: use iterator macros --RR */ 780 /* FIXME: use iterator macros --RR */
795 /* ... then go back and fix counters and names */ 781 /* ... then go back and fix counters and names */
796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 782 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
797 const struct arpt_entry_target *t; 783 const struct xt_entry_target *t;
798 784
799 e = (struct arpt_entry *)(loc_cpu_entry + off); 785 e = (struct arpt_entry *)(loc_cpu_entry + off);
800 if (copy_to_user(userptr + off 786 if (copy_to_user(userptr + off
@@ -807,7 +793,7 @@ static int copy_entries_to_user(unsigned int total_size,
807 793
808 t = arpt_get_target_c(e); 794 t = arpt_get_target_c(e);
809 if (copy_to_user(userptr + off + e->target_offset 795 if (copy_to_user(userptr + off + e->target_offset
810 + offsetof(struct arpt_entry_target, 796 + offsetof(struct xt_entry_target,
811 u.user.name), 797 u.user.name),
812 t->u.kernel.target->name, 798 t->u.kernel.target->name,
813 strlen(t->u.kernel.target->name)+1) != 0) { 799 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -844,7 +830,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
844 const struct xt_table_info *info, 830 const struct xt_table_info *info,
845 const void *base, struct xt_table_info *newinfo) 831 const void *base, struct xt_table_info *newinfo)
846{ 832{
847 const struct arpt_entry_target *t; 833 const struct xt_entry_target *t;
848 unsigned int entry_offset; 834 unsigned int entry_offset;
849 int off, i, ret; 835 int off, i, ret;
850 836
@@ -883,6 +869,7 @@ static int compat_table_info(const struct xt_table_info *info,
883 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 869 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
884 newinfo->initial_entries = 0; 870 newinfo->initial_entries = 0;
885 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 871 loc_cpu_entry = info->entries[raw_smp_processor_id()];
872 xt_compat_init_offsets(NFPROTO_ARP, info->number);
886 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 873 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
887 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 874 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
888 if (ret != 0) 875 if (ret != 0)
@@ -895,7 +882,7 @@ static int compat_table_info(const struct xt_table_info *info,
895static int get_info(struct net *net, void __user *user, 882static int get_info(struct net *net, void __user *user,
896 const int *len, int compat) 883 const int *len, int compat)
897{ 884{
898 char name[ARPT_TABLE_MAXNAMELEN]; 885 char name[XT_TABLE_MAXNAMELEN];
899 struct xt_table *t; 886 struct xt_table *t;
900 int ret; 887 int ret;
901 888
@@ -908,7 +895,7 @@ static int get_info(struct net *net, void __user *user,
908 if (copy_from_user(name, user, sizeof(name)) != 0) 895 if (copy_from_user(name, user, sizeof(name)) != 0)
909 return -EFAULT; 896 return -EFAULT;
910 897
911 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 898 name[XT_TABLE_MAXNAMELEN-1] = '\0';
912#ifdef CONFIG_COMPAT 899#ifdef CONFIG_COMPAT
913 if (compat) 900 if (compat)
914 xt_compat_lock(NFPROTO_ARP); 901 xt_compat_lock(NFPROTO_ARP);
@@ -927,6 +914,7 @@ static int get_info(struct net *net, void __user *user,
927 private = &tmp; 914 private = &tmp;
928 } 915 }
929#endif 916#endif
917 memset(&info, 0, sizeof(info));
930 info.valid_hooks = t->valid_hooks; 918 info.valid_hooks = t->valid_hooks;
931 memcpy(info.hook_entry, private->hook_entry, 919 memcpy(info.hook_entry, private->hook_entry,
932 sizeof(info.hook_entry)); 920 sizeof(info.hook_entry));
@@ -1006,7 +994,7 @@ static int __do_replace(struct net *net, const char *name,
1006 struct arpt_entry *iter; 994 struct arpt_entry *iter;
1007 995
1008 ret = 0; 996 ret = 0;
1009 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 997 counters = vzalloc(num_counters * sizeof(struct xt_counters));
1010 if (!counters) { 998 if (!counters) {
1011 ret = -ENOMEM; 999 ret = -ENOMEM;
1012 goto out; 1000 goto out;
@@ -1081,6 +1069,7 @@ static int do_replace(struct net *net, const void __user *user,
1081 /* overflow check */ 1069 /* overflow check */
1082 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1070 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1083 return -ENOMEM; 1071 return -ENOMEM;
1072 tmp.name[sizeof(tmp.name)-1] = 0;
1084 1073
1085 newinfo = xt_alloc_table_info(tmp.size); 1074 newinfo = xt_alloc_table_info(tmp.size);
1086 if (!newinfo) 1075 if (!newinfo)
@@ -1129,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1129 int ret = 0; 1118 int ret = 0;
1130 void *loc_cpu_entry; 1119 void *loc_cpu_entry;
1131 struct arpt_entry *iter; 1120 struct arpt_entry *iter;
1121 unsigned int addend;
1132#ifdef CONFIG_COMPAT 1122#ifdef CONFIG_COMPAT
1133 struct compat_xt_counters_info compat_tmp; 1123 struct compat_xt_counters_info compat_tmp;
1134 1124
@@ -1185,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
1185 /* Choose the copy that is on our node */ 1175 /* Choose the copy that is on our node */
1186 curcpu = smp_processor_id(); 1176 curcpu = smp_processor_id();
1187 loc_cpu_entry = private->entries[curcpu]; 1177 loc_cpu_entry = private->entries[curcpu];
1188 xt_info_wrlock(curcpu); 1178 addend = xt_write_recseq_begin();
1189 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1179 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1190 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1180 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1191 ++i; 1181 ++i;
1192 } 1182 }
1193 xt_info_wrunlock(curcpu); 1183 xt_write_recseq_end(addend);
1194 unlock_up_free: 1184 unlock_up_free:
1195 local_bh_enable(); 1185 local_bh_enable();
1196 xt_table_unlock(t); 1186 xt_table_unlock(t);
@@ -1204,7 +1194,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1204#ifdef CONFIG_COMPAT 1194#ifdef CONFIG_COMPAT
1205static inline void compat_release_entry(struct compat_arpt_entry *e) 1195static inline void compat_release_entry(struct compat_arpt_entry *e)
1206{ 1196{
1207 struct arpt_entry_target *t; 1197 struct xt_entry_target *t;
1208 1198
1209 t = compat_arpt_get_target(e); 1199 t = compat_arpt_get_target(e);
1210 module_put(t->u.kernel.target->me); 1200 module_put(t->u.kernel.target->me);
@@ -1220,7 +1210,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1220 const unsigned int *underflows, 1210 const unsigned int *underflows,
1221 const char *name) 1211 const char *name)
1222{ 1212{
1223 struct arpt_entry_target *t; 1213 struct xt_entry_target *t;
1224 struct xt_target *target; 1214 struct xt_target *target;
1225 unsigned int entry_offset; 1215 unsigned int entry_offset;
1226 int ret, off, h; 1216 int ret, off, h;
@@ -1288,7 +1278,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1288 unsigned int *size, const char *name, 1278 unsigned int *size, const char *name,
1289 struct xt_table_info *newinfo, unsigned char *base) 1279 struct xt_table_info *newinfo, unsigned char *base)
1290{ 1280{
1291 struct arpt_entry_target *t; 1281 struct xt_entry_target *t;
1292 struct xt_target *target; 1282 struct xt_target *target;
1293 struct arpt_entry *de; 1283 struct arpt_entry *de;
1294 unsigned int origsize; 1284 unsigned int origsize;
@@ -1349,6 +1339,7 @@ static int translate_compat_table(const char *name,
1349 duprintf("translate_compat_table: size %u\n", info->size); 1339 duprintf("translate_compat_table: size %u\n", info->size);
1350 j = 0; 1340 j = 0;
1351 xt_compat_lock(NFPROTO_ARP); 1341 xt_compat_lock(NFPROTO_ARP);
1342 xt_compat_init_offsets(NFPROTO_ARP, number);
1352 /* Walk through entries, checking offsets. */ 1343 /* Walk through entries, checking offsets. */
1353 xt_entry_foreach(iter0, entry0, total_size) { 1344 xt_entry_foreach(iter0, entry0, total_size) {
1354 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1345 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1474,7 +1465,7 @@ out_unlock:
1474} 1465}
1475 1466
1476struct compat_arpt_replace { 1467struct compat_arpt_replace {
1477 char name[ARPT_TABLE_MAXNAMELEN]; 1468 char name[XT_TABLE_MAXNAMELEN];
1478 u32 valid_hooks; 1469 u32 valid_hooks;
1479 u32 num_entries; 1470 u32 num_entries;
1480 u32 size; 1471 u32 size;
@@ -1502,6 +1493,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1502 return -ENOMEM; 1493 return -ENOMEM;
1503 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1494 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1504 return -ENOMEM; 1495 return -ENOMEM;
1496 tmp.name[sizeof(tmp.name)-1] = 0;
1505 1497
1506 newinfo = xt_alloc_table_info(tmp.size); 1498 newinfo = xt_alloc_table_info(tmp.size);
1507 if (!newinfo) 1499 if (!newinfo)
@@ -1567,7 +1559,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1567 struct xt_counters *counters, 1559 struct xt_counters *counters,
1568 unsigned int i) 1560 unsigned int i)
1569{ 1561{
1570 struct arpt_entry_target *t; 1562 struct xt_entry_target *t;
1571 struct compat_arpt_entry __user *ce; 1563 struct compat_arpt_entry __user *ce;
1572 u_int16_t target_offset, next_offset; 1564 u_int16_t target_offset, next_offset;
1573 compat_uint_t origsize; 1565 compat_uint_t origsize;
@@ -1628,7 +1620,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1628} 1620}
1629 1621
1630struct compat_arpt_get_entries { 1622struct compat_arpt_get_entries {
1631 char name[ARPT_TABLE_MAXNAMELEN]; 1623 char name[XT_TABLE_MAXNAMELEN];
1632 compat_uint_t size; 1624 compat_uint_t size;
1633 struct compat_arpt_entry entrytable[0]; 1625 struct compat_arpt_entry entrytable[0];
1634}; 1626};
@@ -1754,6 +1746,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1754 ret = -EFAULT; 1746 ret = -EFAULT;
1755 break; 1747 break;
1756 } 1748 }
1749 rev.name[sizeof(rev.name)-1] = 0;
1757 1750
1758 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, 1751 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1759 rev.revision, 1, &ret), 1752 rev.revision, 1, &ret),
@@ -1828,7 +1821,7 @@ void arpt_unregister_table(struct xt_table *table)
1828/* The built-in targets: standard (NULL) and error. */ 1821/* The built-in targets: standard (NULL) and error. */
1829static struct xt_target arpt_builtin_tg[] __read_mostly = { 1822static struct xt_target arpt_builtin_tg[] __read_mostly = {
1830 { 1823 {
1831 .name = ARPT_STANDARD_TARGET, 1824 .name = XT_STANDARD_TARGET,
1832 .targetsize = sizeof(int), 1825 .targetsize = sizeof(int),
1833 .family = NFPROTO_ARP, 1826 .family = NFPROTO_ARP,
1834#ifdef CONFIG_COMPAT 1827#ifdef CONFIG_COMPAT
@@ -1838,9 +1831,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
1838#endif 1831#endif
1839 }, 1832 },
1840 { 1833 {
1841 .name = ARPT_ERROR_TARGET, 1834 .name = XT_ERROR_TARGET,
1842 .target = arpt_error, 1835 .target = arpt_error,
1843 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1836 .targetsize = XT_FUNCTION_MAXNAMELEN,
1844 .family = NFPROTO_ARP, 1837 .family = NFPROTO_ARP,
1845 }, 1838 },
1846}; 1839};
@@ -1885,7 +1878,7 @@ static int __init arp_tables_init(void)
1885 if (ret < 0) 1878 if (ret < 0)
1886 goto err1; 1879 goto err1;
1887 1880
1888 /* Noone else will be downing sem now, so we won't sleep */ 1881 /* No one else will be downing sem now, so we won't sleep */
1889 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); 1882 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1890 if (ret < 0) 1883 if (ret < 0)
1891 goto err2; 1884 goto err2;
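[Editorial note] In arp_tables.c (and ip_tables.c further down), the per-CPU rwlock scheme (xt_info_rdlock_bh()/xt_info_wrlock()) is replaced by a per-CPU seqcount: the packet path brackets counter updates with xt_write_recseq_begin()/xt_write_recseq_end(), and get_counters() takes a lock-free snapshot with a retry loop. A minimal sketch of that reader loop, assuming kernel context; the demo types stand in for the per-CPU xt_recseq and xt_counters used above.

    #include <linux/seqlock.h>
    #include <linux/types.h>

    struct demo_counter {
            u64 bcnt;
            u64 pcnt;
    };

    static void demo_read_counter(const seqcount_t *s,
                                  const struct demo_counter *c,
                                  struct demo_counter *snap)
    {
            unsigned int start;

            do {
                    start = read_seqcount_begin(s);  /* writer bumps the sequence */
                    snap->bcnt = c->bcnt;
                    snap->pcnt = c->pcnt;
            } while (read_seqcount_retry(s, start)); /* retry if a writer raced */
    }

This is why the old "copy current CPU first, then wrlock the others" block in get_counters() is simply deleted: a torn read is detected by the sequence check and redone, so no per-CPU write lock is needed just to read.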
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..a5e52a9f0a12 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
60 60
61 if (mangle->flags & ~ARPT_MANGLE_MASK || 61 if (mangle->flags & ~ARPT_MANGLE_MASK ||
62 !(mangle->flags & ARPT_MANGLE_MASK)) 62 !(mangle->flags & ARPT_MANGLE_MASK))
63 return false; 63 return -EINVAL;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != ARPT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return -EINVAL;
68 return true; 68 return 0;
69} 69}
70 70
71static struct xt_target arpt_mangle_reg __read_mostly = { 71static struct xt_target arpt_mangle_reg __read_mostly = {
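[Editorial note] The arpt_mangle checkentry() change above follows the xtables convention where check hooks return 0 or a negative errno rather than a bool. A small sketch of that convention, with purely hypothetical demo fields:

    #include <linux/errno.h>

    struct demo_tginfo {
            unsigned int flags;
    };
    #define DEMO_VALID_FLAGS 0x3

    static int demo_checkentry(const struct demo_tginfo *info)
    {
            if (info->flags & ~DEMO_VALID_FLAGS)
                    return -EINVAL;  /* was "return false" under the old bool API */
            return 0;                /* was "return true" */
    }

The errno form lets the core report a meaningful error to userspace instead of collapsing every rejection to the same failure.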
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index d2c1311cb28d..5c9b9d963918 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -203,7 +203,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
203 else 203 else
204 pmsg->outdev_name[0] = '\0'; 204 pmsg->outdev_name[0] = '\0';
205 205
206 if (entry->indev && entry->skb->dev) { 206 if (entry->indev && entry->skb->dev &&
207 entry->skb->mac_header != entry->skb->network_header) {
207 pmsg->hw_type = entry->skb->dev->type; 208 pmsg->hw_type = entry->skb->dev->type;
208 pmsg->hw_addrlen = dev_parse_header(entry->skb, 209 pmsg->hw_addrlen = dev_parse_header(entry->skb,
209 pmsg->hw_addr); 210 pmsg->hw_addr);
@@ -402,7 +403,8 @@ ipq_dev_drop(int ifindex)
402static inline void 403static inline void
403__ipq_rcv_skb(struct sk_buff *skb) 404__ipq_rcv_skb(struct sk_buff *skb)
404{ 405{
405 int status, type, pid, flags, nlmsglen, skblen; 406 int status, type, pid, flags;
407 unsigned int nlmsglen, skblen;
406 struct nlmsghdr *nlh; 408 struct nlmsghdr *nlh;
407 409
408 skblen = skb->len; 410 skblen = skb->len;
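[Editorial note] The ip_queue.c hunk above only copies a hardware address into the queue message when the skb actually carries a link-layer header distinct from the network header. A minimal sketch of that guard, assuming kernel context; demo_copy_hw_addr() is illustrative, not the ip_queue code itself.

    #include <linux/skbuff.h>
    #include <linux/netdevice.h>

    static int demo_copy_hw_addr(const struct sk_buff *skb, unsigned char *buf)
    {
            /* Locally generated packets may have no MAC header; without this
             * check dev_parse_header() would read whatever sits there. */
            if (!skb->dev || skb->mac_header == skb->network_header)
                    return 0;
            return dev_parse_header(skb, buf);
    }

The companion change, making nlmsglen/skblen unsigned, keeps the length comparisons in __ipq_rcv_skb() well-defined for oversized or malformed netlink messages.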
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d163f2e3b2e9..24e556e83a3b 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
68} 68}
69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); 69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
70 70
71/*
72 We keep a set of rules for each CPU, so we can avoid write-locking
73 them in the softirq when updating the counters and therefore
74 only need to read-lock in the softirq; doing a write_lock_bh() in user
75 context stops packets coming through and allows user context to read
76 the counters or update the rules.
77
78 Hence the start of any table is given by get_table() below. */
79
80/* Returns whether matches rule or not. */ 71/* Returns whether matches rule or not. */
81/* Performance critical - called for every packet */ 72/* Performance critical - called for every packet */
82static inline bool 73static inline bool
@@ -186,7 +177,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
186} 177}
187 178
188/* for const-correctness */ 179/* for const-correctness */
189static inline const struct ipt_entry_target * 180static inline const struct xt_entry_target *
190ipt_get_target_c(const struct ipt_entry *e) 181ipt_get_target_c(const struct ipt_entry *e)
191{ 182{
192 return ipt_get_target((struct ipt_entry *)e); 183 return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +221,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
230 const char *hookname, const char **chainname, 221 const char *hookname, const char **chainname,
231 const char **comment, unsigned int *rulenum) 222 const char **comment, unsigned int *rulenum)
232{ 223{
233 const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); 224 const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
234 225
235 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 226 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
236 /* Head of user chain: ERROR target with chainname */ 227 /* Head of user chain: ERROR target with chainname */
237 *chainname = t->target.data; 228 *chainname = t->target.data;
238 (*rulenum) = 0; 229 (*rulenum) = 0;
@@ -241,7 +232,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
241 232
242 if (s->target_offset == sizeof(struct ipt_entry) && 233 if (s->target_offset == sizeof(struct ipt_entry) &&
243 strcmp(t->target.u.kernel.target->name, 234 strcmp(t->target.u.kernel.target->name,
244 IPT_STANDARD_TARGET) == 0 && 235 XT_STANDARD_TARGET) == 0 &&
245 t->verdict < 0 && 236 t->verdict < 0 &&
246 unconditional(&s->ip)) { 237 unconditional(&s->ip)) {
247 /* Tail of chains: STANDARD target (return/policy) */ 238 /* Tail of chains: STANDARD target (return/policy) */
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
311 unsigned int *stackptr, origptr, cpu; 302 unsigned int *stackptr, origptr, cpu;
312 const struct xt_table_info *private; 303 const struct xt_table_info *private;
313 struct xt_action_param acpar; 304 struct xt_action_param acpar;
305 unsigned int addend;
314 306
315 /* Initialization */ 307 /* Initialization */
316 ip = ip_hdr(skb); 308 ip = ip_hdr(skb);
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
331 acpar.hooknum = hook; 323 acpar.hooknum = hook;
332 324
333 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 325 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
334 xt_info_rdlock_bh(); 326 local_bh_disable();
327 addend = xt_write_recseq_begin();
335 private = table->private; 328 private = table->private;
336 cpu = smp_processor_id(); 329 cpu = smp_processor_id();
337 table_base = private->entries[cpu]; 330 table_base = private->entries[cpu];
@@ -346,7 +339,7 @@ ipt_do_table(struct sk_buff *skb,
346 get_entry(table_base, private->underflow[hook])); 339 get_entry(table_base, private->underflow[hook]));
347 340
348 do { 341 do {
349 const struct ipt_entry_target *t; 342 const struct xt_entry_target *t;
350 const struct xt_entry_match *ematch; 343 const struct xt_entry_match *ematch;
351 344
352 IP_NF_ASSERT(e); 345 IP_NF_ASSERT(e);
@@ -380,14 +373,14 @@ ipt_do_table(struct sk_buff *skb,
380 if (!t->u.kernel.target->target) { 373 if (!t->u.kernel.target->target) {
381 int v; 374 int v;
382 375
383 v = ((struct ipt_standard_target *)t)->verdict; 376 v = ((struct xt_standard_target *)t)->verdict;
384 if (v < 0) { 377 if (v < 0) {
385 /* Pop from stack? */ 378 /* Pop from stack? */
386 if (v != IPT_RETURN) { 379 if (v != XT_RETURN) {
387 verdict = (unsigned)(-v) - 1; 380 verdict = (unsigned)(-v) - 1;
388 break; 381 break;
389 } 382 }
390 if (*stackptr == 0) { 383 if (*stackptr <= origptr) {
391 e = get_entry(table_base, 384 e = get_entry(table_base,
392 private->underflow[hook]); 385 private->underflow[hook]);
393 pr_debug("Underflow (this is normal) " 386 pr_debug("Underflow (this is normal) "
@@ -421,16 +414,18 @@ ipt_do_table(struct sk_buff *skb,
421 verdict = t->u.kernel.target->target(skb, &acpar); 414 verdict = t->u.kernel.target->target(skb, &acpar);
422 /* Target might have changed stuff. */ 415 /* Target might have changed stuff. */
423 ip = ip_hdr(skb); 416 ip = ip_hdr(skb);
424 if (verdict == IPT_CONTINUE) 417 if (verdict == XT_CONTINUE)
425 e = ipt_next_entry(e); 418 e = ipt_next_entry(e);
426 else 419 else
427 /* Verdict */ 420 /* Verdict */
428 break; 421 break;
429 } while (!acpar.hotdrop); 422 } while (!acpar.hotdrop);
430 xt_info_rdunlock_bh();
431 pr_debug("Exiting %s; resetting sp from %u to %u\n", 423 pr_debug("Exiting %s; resetting sp from %u to %u\n",
432 __func__, *stackptr, origptr); 424 __func__, *stackptr, origptr);
433 *stackptr = origptr; 425 *stackptr = origptr;
426 xt_write_recseq_end(addend);
427 local_bh_enable();
428
434#ifdef DEBUG_ALLOW_ALL 429#ifdef DEBUG_ALLOW_ALL
435 return NF_ACCEPT; 430 return NF_ACCEPT;
436#else 431#else
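
The hunk above replaces xt_info_rdlock_bh()/xt_info_rdunlock_bh() with local_bh_disable() plus xt_write_recseq_begin()/xt_write_recseq_end(): while traversing the table, the packet path now only bumps a per-CPU sequence counter around its counter updates instead of taking a recursive read lock. A minimal userspace sketch of the writer side of that protocol follows; the names, the single global counter and the lack of per-CPU state are illustrative only, and the real xt_write_recseq pair additionally handles re-entrant table traversal via the returned "addend".

#include <stdint.h>
#include <stdio.h>

struct counters { uint64_t bytes, pkts; };

static unsigned int seq;            /* per-CPU in the kernel; one copy here */
static struct counters cnt;

/* Writer: make the counter pair look atomic to readers that sample
 * 'seq' before and after copying the counters. */
static void count_packet(unsigned int len)
{
	seq++;                  /* odd: update in progress */
	__sync_synchronize();   /* order counter stores after the bump */
	cnt.bytes += len;
	cnt.pkts  += 1;
	__sync_synchronize();
	seq++;                  /* even again: update complete */
}

int main(void)
{
	count_packet(1500);
	count_packet(40);
	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)cnt.pkts, (unsigned long long)cnt.bytes);
	return 0;
}
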
@@ -461,7 +456,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
461 e->counters.pcnt = pos; 456 e->counters.pcnt = pos;
462 457
463 for (;;) { 458 for (;;) {
464 const struct ipt_standard_target *t 459 const struct xt_standard_target *t
465 = (void *)ipt_get_target_c(e); 460 = (void *)ipt_get_target_c(e);
466 int visited = e->comefrom & (1 << hook); 461 int visited = e->comefrom & (1 << hook);
467 462
@@ -475,13 +470,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
475 /* Unconditional return/END. */ 470 /* Unconditional return/END. */
476 if ((e->target_offset == sizeof(struct ipt_entry) && 471 if ((e->target_offset == sizeof(struct ipt_entry) &&
477 (strcmp(t->target.u.user.name, 472 (strcmp(t->target.u.user.name,
478 IPT_STANDARD_TARGET) == 0) && 473 XT_STANDARD_TARGET) == 0) &&
479 t->verdict < 0 && unconditional(&e->ip)) || 474 t->verdict < 0 && unconditional(&e->ip)) ||
480 visited) { 475 visited) {
481 unsigned int oldpos, size; 476 unsigned int oldpos, size;
482 477
483 if ((strcmp(t->target.u.user.name, 478 if ((strcmp(t->target.u.user.name,
484 IPT_STANDARD_TARGET) == 0) && 479 XT_STANDARD_TARGET) == 0) &&
485 t->verdict < -NF_MAX_VERDICT - 1) { 480 t->verdict < -NF_MAX_VERDICT - 1) {
486 duprintf("mark_source_chains: bad " 481 duprintf("mark_source_chains: bad "
487 "negative verdict (%i)\n", 482 "negative verdict (%i)\n",
@@ -524,7 +519,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
524 int newpos = t->verdict; 519 int newpos = t->verdict;
525 520
526 if (strcmp(t->target.u.user.name, 521 if (strcmp(t->target.u.user.name,
527 IPT_STANDARD_TARGET) == 0 && 522 XT_STANDARD_TARGET) == 0 &&
528 newpos >= 0) { 523 newpos >= 0) {
529 if (newpos > newinfo->size - 524 if (newpos > newinfo->size -
530 sizeof(struct ipt_entry)) { 525 sizeof(struct ipt_entry)) {
@@ -552,7 +547,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
552 return 1; 547 return 1;
553} 548}
554 549
555static void cleanup_match(struct ipt_entry_match *m, struct net *net) 550static void cleanup_match(struct xt_entry_match *m, struct net *net)
556{ 551{
557 struct xt_mtdtor_param par; 552 struct xt_mtdtor_param par;
558 553
@@ -568,14 +563,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
568static int 563static int
569check_entry(const struct ipt_entry *e, const char *name) 564check_entry(const struct ipt_entry *e, const char *name)
570{ 565{
571 const struct ipt_entry_target *t; 566 const struct xt_entry_target *t;
572 567
573 if (!ip_checkentry(&e->ip)) { 568 if (!ip_checkentry(&e->ip)) {
574 duprintf("ip check failed %p %s.\n", e, par->match->name); 569 duprintf("ip check failed %p %s.\n", e, name);
575 return -EINVAL; 570 return -EINVAL;
576 } 571 }
577 572
578 if (e->target_offset + sizeof(struct ipt_entry_target) > 573 if (e->target_offset + sizeof(struct xt_entry_target) >
579 e->next_offset) 574 e->next_offset)
580 return -EINVAL; 575 return -EINVAL;
581 576
@@ -587,7 +582,7 @@ check_entry(const struct ipt_entry *e, const char *name)
587} 582}
588 583
589static int 584static int
590check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 585check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
591{ 586{
592 const struct ipt_ip *ip = par->entryinfo; 587 const struct ipt_ip *ip = par->entryinfo;
593 int ret; 588 int ret;
@@ -605,7 +600,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
605} 600}
606 601
607static int 602static int
608find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 603find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
609{ 604{
610 struct xt_match *match; 605 struct xt_match *match;
611 int ret; 606 int ret;
@@ -630,7 +625,7 @@ err:
630 625
631static int check_target(struct ipt_entry *e, struct net *net, const char *name) 626static int check_target(struct ipt_entry *e, struct net *net, const char *name)
632{ 627{
633 struct ipt_entry_target *t = ipt_get_target(e); 628 struct xt_entry_target *t = ipt_get_target(e);
634 struct xt_tgchk_param par = { 629 struct xt_tgchk_param par = {
635 .net = net, 630 .net = net,
636 .table = name, 631 .table = name,
@@ -656,7 +651,7 @@ static int
656find_check_entry(struct ipt_entry *e, struct net *net, const char *name, 651find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
657 unsigned int size) 652 unsigned int size)
658{ 653{
659 struct ipt_entry_target *t; 654 struct xt_entry_target *t;
660 struct xt_target *target; 655 struct xt_target *target;
661 int ret; 656 int ret;
662 unsigned int j; 657 unsigned int j;
@@ -707,7 +702,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
707 702
708static bool check_underflow(const struct ipt_entry *e) 703static bool check_underflow(const struct ipt_entry *e)
709{ 704{
710 const struct ipt_entry_target *t; 705 const struct xt_entry_target *t;
711 unsigned int verdict; 706 unsigned int verdict;
712 707
713 if (!unconditional(&e->ip)) 708 if (!unconditional(&e->ip))
@@ -715,7 +710,7 @@ static bool check_underflow(const struct ipt_entry *e)
715 t = ipt_get_target_c(e); 710 t = ipt_get_target_c(e);
716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 711 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
717 return false; 712 return false;
718 verdict = ((struct ipt_standard_target *)t)->verdict; 713 verdict = ((struct xt_standard_target *)t)->verdict;
719 verdict = -verdict - 1; 714 verdict = -verdict - 1;
720 return verdict == NF_DROP || verdict == NF_ACCEPT; 715 return verdict == NF_DROP || verdict == NF_ACCEPT;
721} 716}
@@ -738,7 +733,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
738 } 733 }
739 734
740 if (e->next_offset 735 if (e->next_offset
741 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { 736 < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
742 duprintf("checking: element %p size %u\n", 737 duprintf("checking: element %p size %u\n",
743 e, e->next_offset); 738 e, e->next_offset);
744 return -EINVAL; 739 return -EINVAL;
@@ -771,7 +766,7 @@ static void
771cleanup_entry(struct ipt_entry *e, struct net *net) 766cleanup_entry(struct ipt_entry *e, struct net *net)
772{ 767{
773 struct xt_tgdtor_param par; 768 struct xt_tgdtor_param par;
774 struct ipt_entry_target *t; 769 struct xt_entry_target *t;
775 struct xt_entry_match *ematch; 770 struct xt_entry_match *ematch;
776 771
777 /* Cleanup all matches */ 772 /* Cleanup all matches */
@@ -884,42 +879,25 @@ get_counters(const struct xt_table_info *t,
884 struct ipt_entry *iter; 879 struct ipt_entry *iter;
885 unsigned int cpu; 880 unsigned int cpu;
886 unsigned int i; 881 unsigned int i;
887 unsigned int curcpu = get_cpu();
888
889 /* Instead of clearing (by a previous call to memset())
890 * the counters and using adds, we set the counters
891 * with data used by 'current' CPU.
892 *
893 * Bottom half has to be disabled to prevent deadlock
894 * if new softirq were to run and call ipt_do_table
895 */
896 local_bh_disable();
897 i = 0;
898 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
899 SET_COUNTER(counters[i], iter->counters.bcnt,
900 iter->counters.pcnt);
901 ++i;
902 }
903 local_bh_enable();
904 /* Processing counters from other cpus, we can let bottom half enabled,
905 * (preemption is disabled)
906 */
907 882
908 for_each_possible_cpu(cpu) { 883 for_each_possible_cpu(cpu) {
909 if (cpu == curcpu) 884 seqcount_t *s = &per_cpu(xt_recseq, cpu);
910 continue; 885
911 i = 0; 886 i = 0;
912 local_bh_disable();
913 xt_info_wrlock(cpu);
914 xt_entry_foreach(iter, t->entries[cpu], t->size) { 887 xt_entry_foreach(iter, t->entries[cpu], t->size) {
915 ADD_COUNTER(counters[i], iter->counters.bcnt, 888 u64 bcnt, pcnt;
916 iter->counters.pcnt); 889 unsigned int start;
890
891 do {
892 start = read_seqcount_begin(s);
893 bcnt = iter->counters.bcnt;
894 pcnt = iter->counters.pcnt;
895 } while (read_seqcount_retry(s, start));
896
897 ADD_COUNTER(counters[i], bcnt, pcnt);
917 ++i; /* macro does multi eval of i */ 898 ++i; /* macro does multi eval of i */
918 } 899 }
919 xt_info_wrunlock(cpu);
920 local_bh_enable();
921 } 900 }
922 put_cpu();
923} 901}
924 902
925static struct xt_counters *alloc_counters(const struct xt_table *table) 903static struct xt_counters *alloc_counters(const struct xt_table *table)
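
On the reader side, get_counters() now snapshots each CPU's 64-bit counters inside a read_seqcount_begin()/read_seqcount_retry() loop instead of write-locking every CPU, so summing counters for userspace no longer stalls packet processing; together with the switch to vzalloc() for the counter array just below, the special-cased "current CPU" pass also disappears. A hedged, self-contained sketch of the read-retry idiom (plain C, not the kernel seqcount_t API):

#include <stdint.h>
#include <stdio.h>

struct counters { uint64_t bytes, pkts; };

static volatile unsigned int seq;   /* bumped twice per writer update */
static struct counters cnt = { 60000, 40 };

/* Reader: retry until a snapshot was taken with no writer in progress
 * (seq even) and no writer completed in between (seq unchanged). */
static struct counters read_counters(void)
{
	struct counters snap;
	unsigned int start;

	do {
		start = seq;
		__sync_synchronize();
		snap = cnt;
		__sync_synchronize();
	} while ((start & 1) || seq != start);

	return snap;
}

int main(void)
{
	struct counters c = read_counters();

	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)c.pkts, (unsigned long long)c.bytes);
	return 0;
}
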
@@ -932,7 +910,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
932 (other than comefrom, which userspace doesn't care 910 (other than comefrom, which userspace doesn't care
933 about). */ 911 about). */
934 countersize = sizeof(struct xt_counters) * private->number; 912 countersize = sizeof(struct xt_counters) * private->number;
935 counters = vmalloc(countersize); 913 counters = vzalloc(countersize);
936 914
937 if (counters == NULL) 915 if (counters == NULL)
938 return ERR_PTR(-ENOMEM); 916 return ERR_PTR(-ENOMEM);
@@ -972,8 +950,8 @@ copy_entries_to_user(unsigned int total_size,
972 /* ... then go back and fix counters and names */ 950 /* ... then go back and fix counters and names */
973 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 951 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
974 unsigned int i; 952 unsigned int i;
975 const struct ipt_entry_match *m; 953 const struct xt_entry_match *m;
976 const struct ipt_entry_target *t; 954 const struct xt_entry_target *t;
977 955
978 e = (struct ipt_entry *)(loc_cpu_entry + off); 956 e = (struct ipt_entry *)(loc_cpu_entry + off);
979 if (copy_to_user(userptr + off 957 if (copy_to_user(userptr + off
@@ -990,7 +968,7 @@ copy_entries_to_user(unsigned int total_size,
990 m = (void *)e + i; 968 m = (void *)e + i;
991 969
992 if (copy_to_user(userptr + off + i 970 if (copy_to_user(userptr + off + i
993 + offsetof(struct ipt_entry_match, 971 + offsetof(struct xt_entry_match,
994 u.user.name), 972 u.user.name),
995 m->u.kernel.match->name, 973 m->u.kernel.match->name,
996 strlen(m->u.kernel.match->name)+1) 974 strlen(m->u.kernel.match->name)+1)
@@ -1002,7 +980,7 @@ copy_entries_to_user(unsigned int total_size,
1002 980
1003 t = ipt_get_target_c(e); 981 t = ipt_get_target_c(e);
1004 if (copy_to_user(userptr + off + e->target_offset 982 if (copy_to_user(userptr + off + e->target_offset
1005 + offsetof(struct ipt_entry_target, 983 + offsetof(struct xt_entry_target,
1006 u.user.name), 984 u.user.name),
1007 t->u.kernel.target->name, 985 t->u.kernel.target->name,
1008 strlen(t->u.kernel.target->name)+1) != 0) { 986 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1040,7 +1018,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
1040 const void *base, struct xt_table_info *newinfo) 1018 const void *base, struct xt_table_info *newinfo)
1041{ 1019{
1042 const struct xt_entry_match *ematch; 1020 const struct xt_entry_match *ematch;
1043 const struct ipt_entry_target *t; 1021 const struct xt_entry_target *t;
1044 unsigned int entry_offset; 1022 unsigned int entry_offset;
1045 int off, i, ret; 1023 int off, i, ret;
1046 1024
@@ -1080,6 +1058,7 @@ static int compat_table_info(const struct xt_table_info *info,
1080 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1058 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1081 newinfo->initial_entries = 0; 1059 newinfo->initial_entries = 0;
1082 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1060 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1061 xt_compat_init_offsets(AF_INET, info->number);
1083 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1062 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1084 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1063 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1085 if (ret != 0) 1064 if (ret != 0)
@@ -1092,7 +1071,7 @@ static int compat_table_info(const struct xt_table_info *info,
1092static int get_info(struct net *net, void __user *user, 1071static int get_info(struct net *net, void __user *user,
1093 const int *len, int compat) 1072 const int *len, int compat)
1094{ 1073{
1095 char name[IPT_TABLE_MAXNAMELEN]; 1074 char name[XT_TABLE_MAXNAMELEN];
1096 struct xt_table *t; 1075 struct xt_table *t;
1097 int ret; 1076 int ret;
1098 1077
@@ -1105,7 +1084,7 @@ static int get_info(struct net *net, void __user *user,
1105 if (copy_from_user(name, user, sizeof(name)) != 0) 1084 if (copy_from_user(name, user, sizeof(name)) != 0)
1106 return -EFAULT; 1085 return -EFAULT;
1107 1086
1108 name[IPT_TABLE_MAXNAMELEN-1] = '\0'; 1087 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1109#ifdef CONFIG_COMPAT 1088#ifdef CONFIG_COMPAT
1110 if (compat) 1089 if (compat)
1111 xt_compat_lock(AF_INET); 1090 xt_compat_lock(AF_INET);
@@ -1124,6 +1103,7 @@ static int get_info(struct net *net, void __user *user,
1124 private = &tmp; 1103 private = &tmp;
1125 } 1104 }
1126#endif 1105#endif
1106 memset(&info, 0, sizeof(info));
1127 info.valid_hooks = t->valid_hooks; 1107 info.valid_hooks = t->valid_hooks;
1128 memcpy(info.hook_entry, private->hook_entry, 1108 memcpy(info.hook_entry, private->hook_entry,
1129 sizeof(info.hook_entry)); 1109 sizeof(info.hook_entry));
@@ -1202,7 +1182,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1202 struct ipt_entry *iter; 1182 struct ipt_entry *iter;
1203 1183
1204 ret = 0; 1184 ret = 0;
1205 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 1185 counters = vzalloc(num_counters * sizeof(struct xt_counters));
1206 if (!counters) { 1186 if (!counters) {
1207 ret = -ENOMEM; 1187 ret = -ENOMEM;
1208 goto out; 1188 goto out;
@@ -1277,6 +1257,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
1277 /* overflow check */ 1257 /* overflow check */
1278 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1258 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1279 return -ENOMEM; 1259 return -ENOMEM;
1260 tmp.name[sizeof(tmp.name)-1] = 0;
1280 1261
1281 newinfo = xt_alloc_table_info(tmp.size); 1262 newinfo = xt_alloc_table_info(tmp.size);
1282 if (!newinfo) 1263 if (!newinfo)
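
do_replace() (and, further down, compat_do_replace() and the revision lookup) now forces a terminating NUL into the name copied in from userspace before it is ever treated as a C string; without it, an unterminated name could make later strcmp()/printk() calls read past the end of the fixed-size field. A small sketch of the idiom, with made-up struct and size names standing in for the real ipt_replace/XT_TABLE_MAXNAMELEN:

#include <string.h>
#include <stdio.h>

#define TBL_NAMELEN 32                     /* illustrative size only */

struct replace_req { char name[TBL_NAMELEN]; };

/* 'raw' stands in for an untrusted buffer copied in from userspace. */
static void load_request(struct replace_req *req, const char *raw, size_t rawlen)
{
	size_t n = rawlen < sizeof(req->name) ? rawlen : sizeof(req->name);

	memcpy(req->name, raw, n);
	req->name[sizeof(req->name) - 1] = '\0';  /* never trust it to be terminated */
}

int main(void)
{
	char evil[TBL_NAMELEN];
	struct replace_req req;

	memset(evil, 'A', sizeof(evil));          /* no NUL anywhere */
	load_request(&req, evil, sizeof(evil));
	printf("name=%s len=%zu\n", req.name, strlen(req.name));  /* bounded */
	return 0;
}
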
@@ -1326,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
1326 int ret = 0; 1307 int ret = 0;
1327 void *loc_cpu_entry; 1308 void *loc_cpu_entry;
1328 struct ipt_entry *iter; 1309 struct ipt_entry *iter;
1310 unsigned int addend;
1329#ifdef CONFIG_COMPAT 1311#ifdef CONFIG_COMPAT
1330 struct compat_xt_counters_info compat_tmp; 1312 struct compat_xt_counters_info compat_tmp;
1331 1313
@@ -1382,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
1382 /* Choose the copy that is on our node */ 1364 /* Choose the copy that is on our node */
1383 curcpu = smp_processor_id(); 1365 curcpu = smp_processor_id();
1384 loc_cpu_entry = private->entries[curcpu]; 1366 loc_cpu_entry = private->entries[curcpu];
1385 xt_info_wrlock(curcpu); 1367 addend = xt_write_recseq_begin();
1386 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1368 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1387 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1369 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1388 ++i; 1370 ++i;
1389 } 1371 }
1390 xt_info_wrunlock(curcpu); 1372 xt_write_recseq_end(addend);
1391 unlock_up_free: 1373 unlock_up_free:
1392 local_bh_enable(); 1374 local_bh_enable();
1393 xt_table_unlock(t); 1375 xt_table_unlock(t);
@@ -1400,14 +1382,14 @@ do_add_counters(struct net *net, const void __user *user,
1400 1382
1401#ifdef CONFIG_COMPAT 1383#ifdef CONFIG_COMPAT
1402struct compat_ipt_replace { 1384struct compat_ipt_replace {
1403 char name[IPT_TABLE_MAXNAMELEN]; 1385 char name[XT_TABLE_MAXNAMELEN];
1404 u32 valid_hooks; 1386 u32 valid_hooks;
1405 u32 num_entries; 1387 u32 num_entries;
1406 u32 size; 1388 u32 size;
1407 u32 hook_entry[NF_INET_NUMHOOKS]; 1389 u32 hook_entry[NF_INET_NUMHOOKS];
1408 u32 underflow[NF_INET_NUMHOOKS]; 1390 u32 underflow[NF_INET_NUMHOOKS];
1409 u32 num_counters; 1391 u32 num_counters;
1410 compat_uptr_t counters; /* struct ipt_counters * */ 1392 compat_uptr_t counters; /* struct xt_counters * */
1411 struct compat_ipt_entry entries[0]; 1393 struct compat_ipt_entry entries[0];
1412}; 1394};
1413 1395
@@ -1416,7 +1398,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1416 unsigned int *size, struct xt_counters *counters, 1398 unsigned int *size, struct xt_counters *counters,
1417 unsigned int i) 1399 unsigned int i)
1418{ 1400{
1419 struct ipt_entry_target *t; 1401 struct xt_entry_target *t;
1420 struct compat_ipt_entry __user *ce; 1402 struct compat_ipt_entry __user *ce;
1421 u_int16_t target_offset, next_offset; 1403 u_int16_t target_offset, next_offset;
1422 compat_uint_t origsize; 1404 compat_uint_t origsize;
@@ -1451,7 +1433,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1451} 1433}
1452 1434
1453static int 1435static int
1454compat_find_calc_match(struct ipt_entry_match *m, 1436compat_find_calc_match(struct xt_entry_match *m,
1455 const char *name, 1437 const char *name,
1456 const struct ipt_ip *ip, 1438 const struct ipt_ip *ip,
1457 unsigned int hookmask, 1439 unsigned int hookmask,
@@ -1473,7 +1455,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
1473 1455
1474static void compat_release_entry(struct compat_ipt_entry *e) 1456static void compat_release_entry(struct compat_ipt_entry *e)
1475{ 1457{
1476 struct ipt_entry_target *t; 1458 struct xt_entry_target *t;
1477 struct xt_entry_match *ematch; 1459 struct xt_entry_match *ematch;
1478 1460
1479 /* Cleanup all matches */ 1461 /* Cleanup all matches */
@@ -1494,7 +1476,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1494 const char *name) 1476 const char *name)
1495{ 1477{
1496 struct xt_entry_match *ematch; 1478 struct xt_entry_match *ematch;
1497 struct ipt_entry_target *t; 1479 struct xt_entry_target *t;
1498 struct xt_target *target; 1480 struct xt_target *target;
1499 unsigned int entry_offset; 1481 unsigned int entry_offset;
1500 unsigned int j; 1482 unsigned int j;
@@ -1576,7 +1558,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1576 unsigned int *size, const char *name, 1558 unsigned int *size, const char *name,
1577 struct xt_table_info *newinfo, unsigned char *base) 1559 struct xt_table_info *newinfo, unsigned char *base)
1578{ 1560{
1579 struct ipt_entry_target *t; 1561 struct xt_entry_target *t;
1580 struct xt_target *target; 1562 struct xt_target *target;
1581 struct ipt_entry *de; 1563 struct ipt_entry *de;
1582 unsigned int origsize; 1564 unsigned int origsize;
@@ -1680,6 +1662,7 @@ translate_compat_table(struct net *net,
1680 duprintf("translate_compat_table: size %u\n", info->size); 1662 duprintf("translate_compat_table: size %u\n", info->size);
1681 j = 0; 1663 j = 0;
1682 xt_compat_lock(AF_INET); 1664 xt_compat_lock(AF_INET);
1665 xt_compat_init_offsets(AF_INET, number);
1683 /* Walk through entries, checking offsets. */ 1666 /* Walk through entries, checking offsets. */
1684 xt_entry_foreach(iter0, entry0, total_size) { 1667 xt_entry_foreach(iter0, entry0, total_size) {
1685 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1668 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1821,6 +1804,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1821 return -ENOMEM; 1804 return -ENOMEM;
1822 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1805 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1823 return -ENOMEM; 1806 return -ENOMEM;
1807 tmp.name[sizeof(tmp.name)-1] = 0;
1824 1808
1825 newinfo = xt_alloc_table_info(tmp.size); 1809 newinfo = xt_alloc_table_info(tmp.size);
1826 if (!newinfo) 1810 if (!newinfo)
@@ -1884,7 +1868,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1884} 1868}
1885 1869
1886struct compat_ipt_get_entries { 1870struct compat_ipt_get_entries {
1887 char name[IPT_TABLE_MAXNAMELEN]; 1871 char name[XT_TABLE_MAXNAMELEN];
1888 compat_uint_t size; 1872 compat_uint_t size;
1889 struct compat_ipt_entry entrytable[0]; 1873 struct compat_ipt_entry entrytable[0];
1890}; 1874};
@@ -2039,7 +2023,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2039 2023
2040 case IPT_SO_GET_REVISION_MATCH: 2024 case IPT_SO_GET_REVISION_MATCH:
2041 case IPT_SO_GET_REVISION_TARGET: { 2025 case IPT_SO_GET_REVISION_TARGET: {
2042 struct ipt_get_revision rev; 2026 struct xt_get_revision rev;
2043 int target; 2027 int target;
2044 2028
2045 if (*len != sizeof(rev)) { 2029 if (*len != sizeof(rev)) {
@@ -2050,6 +2034,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2050 ret = -EFAULT; 2034 ret = -EFAULT;
2051 break; 2035 break;
2052 } 2036 }
2037 rev.name[sizeof(rev.name)-1] = 0;
2053 2038
2054 if (cmd == IPT_SO_GET_REVISION_TARGET) 2039 if (cmd == IPT_SO_GET_REVISION_TARGET)
2055 target = 1; 2040 target = 1;
@@ -2176,7 +2161,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
2176 2161
2177static struct xt_target ipt_builtin_tg[] __read_mostly = { 2162static struct xt_target ipt_builtin_tg[] __read_mostly = {
2178 { 2163 {
2179 .name = IPT_STANDARD_TARGET, 2164 .name = XT_STANDARD_TARGET,
2180 .targetsize = sizeof(int), 2165 .targetsize = sizeof(int),
2181 .family = NFPROTO_IPV4, 2166 .family = NFPROTO_IPV4,
2182#ifdef CONFIG_COMPAT 2167#ifdef CONFIG_COMPAT
@@ -2186,9 +2171,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
2186#endif 2171#endif
2187 }, 2172 },
2188 { 2173 {
2189 .name = IPT_ERROR_TARGET, 2174 .name = XT_ERROR_TARGET,
2190 .target = ipt_error, 2175 .target = ipt_error,
2191 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2176 .targetsize = XT_FUNCTION_MAXNAMELEN,
2192 .family = NFPROTO_IPV4, 2177 .family = NFPROTO_IPV4,
2193 }, 2178 },
2194}; 2179};
@@ -2244,7 +2229,7 @@ static int __init ip_tables_init(void)
2244 if (ret < 0) 2229 if (ret < 0)
2245 goto err1; 2230 goto err1;
2246 2231
2247 /* Noone else will be downing sem now, so we won't sleep */ 2232 /* No one else will be downing sem now, so we won't sleep */
2248 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); 2233 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2249 if (ret < 0) 2234 if (ret < 0)
2250 goto err2; 2235 goto err2;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..5c9e97c79017 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
30#include <net/net_namespace.h> 30#include <net/net_namespace.h>
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
32 33
33#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
34 35
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
231{ 232{
232 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
233 unsigned long hashval; 234 unsigned long hashval;
234 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
235 const u_int16_t *ports; 236 int poff;
236 237
237 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
238 case IPPROTO_TCP: 239 if (poff >= 0) {
239 case IPPROTO_UDP: 240 const u_int16_t *ports;
240 case IPPROTO_UDPLITE: 241 u16 _ports[2];
241 case IPPROTO_SCTP: 242
242 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
243 case IPPROTO_ICMP: 244 if (ports) {
244 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
245 sport = ports[0]; 246 dport = ports[1];
246 dport = ports[1]; 247 }
247 break; 248 } else {
248 default:
249 if (net_ratelimit()) 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol); 250 pr_info("unknown protocol %u\n", iph->protocol);
251 sport = dport = 0;
252 } 251 }
253 252
254 switch (config->hash_mode) { 253 switch (config->hash_mode) {
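
The CLUSTERIP hash no longer open-codes a protocol switch and dereferences the packet directly past the IP header: it asks proto_ports_offset() where the 16-bit port pair sits for the given protocol and pulls it out with skb_header_pointer(), which copies the bytes even when they live in a non-linear part of the skb. A rough userspace model of that lookup-then-bounded-read pattern (the protocol numbers are real; the helper names and the flat packet buffer are illustrative, and the table is not the kernel's exact one):

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Offset of the port pair inside the transport header, or -1 when the
 * protocol has no ports (the idea behind proto_ports_offset()). */
static int ports_offset(uint8_t proto)
{
	switch (proto) {
	case 6: case 17: case 136: case 132: case 33:  /* TCP/UDP/UDPLITE/SCTP/DCCP */
		return 0;
	default:
		return -1;
	}
}

/* Bounded copy of 'len' bytes at 'off', like skb_header_pointer() for a
 * flat buffer: fails instead of reading past the packet. */
static const void *hdr_ptr(const uint8_t *pkt, size_t pktlen,
			   size_t off, size_t len, void *buf)
{
	if (off + len > pktlen)
		return NULL;
	memcpy(buf, pkt + off, len);
	return buf;
}

int main(void)
{
	/* 20-byte IPv4 header (ihl=5, proto=TCP) followed by a TCP header stub. */
	uint8_t pkt[24] = { 0x45, 0, 0, 24, 0, 0, 0, 0, 64, 6 };
	uint16_t sport = 0, dport = 0, ports[2];
	size_t ihl = (pkt[0] & 0x0f) * 4;
	int poff = ports_offset(pkt[9]);

	pkt[20] = 0x00; pkt[21] = 0x50;   /* sport 80   */
	pkt[22] = 0x1f; pkt[23] = 0x90;   /* dport 8080 */

	if (poff >= 0 && hdr_ptr(pkt, sizeof(pkt), ihl + poff, 4, ports)) {
		sport = ntohs(ports[0]);
		dport = ntohs(ports[1]);
	}
	printf("proto=%u sport=%u dport=%u\n", pkt[9], sport, dport);
	return 0;
}
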
@@ -301,19 +300,14 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
301 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
302 301
303 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
304 if (ct == NULL) { 303 if (ct == NULL)
305 pr_info("no conntrack!\n");
306 /* FIXME: need to drop invalid ones, since replies
307 * to outgoing connections of other nodes will be
308 * marked as INVALID */
309 return NF_DROP; 304 return NF_DROP;
310 }
311 305
312 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
313 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
314 if (ip_hdr(skb)->protocol == IPPROTO_ICMP && 308 if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
315 (ctinfo == IP_CT_RELATED || 309 (ctinfo == IP_CT_RELATED ||
316 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) 310 ctinfo == IP_CT_RELATED_REPLY))
317 return XT_CONTINUE; 311 return XT_CONTINUE;
318 312
319 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 313 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -327,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
327 ct->mark = hash; 321 ct->mark = hash;
328 break; 322 break;
329 case IP_CT_RELATED: 323 case IP_CT_RELATED:
330 case IP_CT_RELATED+IP_CT_IS_REPLY: 324 case IP_CT_RELATED_REPLY:
331 /* FIXME: we don't handle expectations at the 325 /* FIXME: we don't handle expectations at the
332 * moment. they can arrive on a different node than 326 * moment. they can arrive on a different node than
333 * the master connection (e.g. FTP passive mode) */ 327 * the master connection (e.g. FTP passive mode) */
334 case IP_CT_ESTABLISHED: 328 case IP_CT_ESTABLISHED:
335 case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: 329 case IP_CT_ESTABLISHED_REPLY:
336 break; 330 break;
337 default: 331 default:
338 break; 332 break;
@@ -670,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
670 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
671 unsigned long nodenum; 665 unsigned long nodenum;
672 666
673 if (copy_from_user(buffer, input, PROC_WRITELEN)) 667 if (size > PROC_WRITELEN)
668 return -EIO;
669 if (copy_from_user(buffer, input, size))
674 return -EFAULT; 670 return -EFAULT;
671 buffer[size] = 0;
675 672
676 if (*buffer == '+') { 673 if (*buffer == '+') {
677 nodenum = simple_strtoul(buffer+1, NULL, 10); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
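
clusterip_proc_write() used to copy a fixed PROC_WRITELEN bytes no matter how much the caller actually wrote; the new code rejects oversized writes, copies only 'size' bytes and terminates the buffer, so short writes no longer pick up stale stack bytes and long writes cannot run past the buffer. A minimal sketch of the bounded-write pattern, with an illustrative buffer size:

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define WRITE_MAX 8                /* stands in for PROC_WRITELEN */

/* Returns the parsed node number, or a negative errno-style value. */
static long handle_write(const char *input, size_t size)
{
	char buf[WRITE_MAX + 1];

	if (size > WRITE_MAX)
		return -EIO;               /* refuse oversized writes */
	memcpy(buf, input, size);          /* copy_from_user() in the kernel */
	buf[size] = '\0';                  /* terminate exactly what was written */

	if (buf[0] == '+')
		return strtol(buf + 1, NULL, 10);
	return -EINVAL;
}

int main(void)
{
	printf("%ld\n", handle_write("+3", 2));          /* 3 */
	printf("%ld\n", handle_write("+123456789", 10)); /* -EIO: too long */
	return 0;
}
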
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce2..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
24#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
25#include <linux/netfilter_ipv4/ipt_LOG.h> 25#include <linux/netfilter_ipv4/ipt_LOG.h>
26#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
27#include <net/netfilter/xt_log.h>
27 28
28MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 30MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
30MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); 31MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
31 32
32/* Use lock to serialize, so printks don't overlap */
33static DEFINE_SPINLOCK(log_lock);
34
35/* One level of recursion won't kill us */ 33/* One level of recursion won't kill us */
36static void dump_packet(const struct nf_loginfo *info, 34static void dump_packet(struct sbuff *m,
35 const struct nf_loginfo *info,
37 const struct sk_buff *skb, 36 const struct sk_buff *skb,
38 unsigned int iphoff) 37 unsigned int iphoff)
39{ 38{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
48 47
49 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
50 if (ih == NULL) { 49 if (ih == NULL) {
51 printk("TRUNCATED"); 50 sb_add(m, "TRUNCATED");
52 return; 51 return;
53 } 52 }
54 53
55 /* Important fields: 54 /* Important fields:
56 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
57 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
58 printk("SRC=%pI4 DST=%pI4 ", 57 sb_add(m, "SRC=%pI4 DST=%pI4 ",
59 &ih->saddr, &ih->daddr); 58 &ih->saddr, &ih->daddr);
60 59
61 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
62 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
63 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, 62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
64 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); 63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
65 64
66 /* Max length: 6 "CE DF MF " */ 65 /* Max length: 6 "CE DF MF " */
67 if (ntohs(ih->frag_off) & IP_CE) 66 if (ntohs(ih->frag_off) & IP_CE)
68 printk("CE "); 67 sb_add(m, "CE ");
69 if (ntohs(ih->frag_off) & IP_DF) 68 if (ntohs(ih->frag_off) & IP_DF)
70 printk("DF "); 69 sb_add(m, "DF ");
71 if (ntohs(ih->frag_off) & IP_MF) 70 if (ntohs(ih->frag_off) & IP_MF)
72 printk("MF "); 71 sb_add(m, "MF ");
73 72
74 /* Max length: 11 "FRAG:65535 " */ 73 /* Max length: 11 "FRAG:65535 " */
75 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
76 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
77 76
78 if ((logflags & IPT_LOG_IPOPT) && 77 if ((logflags & IPT_LOG_IPOPT) &&
79 ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph), 84 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt); 85 optsize, _opt);
87 if (op == NULL) { 86 if (op == NULL) {
88 printk("TRUNCATED"); 87 sb_add(m, "TRUNCATED");
89 return; 88 return;
90 } 89 }
91 90
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 91 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT ("); 92 sb_add(m, "OPT (");
94 for (i = 0; i < optsize; i++) 93 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]); 94 sb_add(m, "%02X", op[i]);
96 printk(") "); 95 sb_add(m, ") ");
97 } 96 }
98 97
99 switch (ih->protocol) { 98 switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
102 const struct tcphdr *th; 101 const struct tcphdr *th;
103 102
104 /* Max length: 10 "PROTO=TCP " */ 103 /* Max length: 10 "PROTO=TCP " */
105 printk("PROTO=TCP "); 104 sb_add(m, "PROTO=TCP ");
106 105
107 if (ntohs(ih->frag_off) & IP_OFFSET) 106 if (ntohs(ih->frag_off) & IP_OFFSET)
108 break; 107 break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
111 th = skb_header_pointer(skb, iphoff + ih->ihl * 4, 110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
112 sizeof(_tcph), &_tcph); 111 sizeof(_tcph), &_tcph);
113 if (th == NULL) { 112 if (th == NULL) {
114 printk("INCOMPLETE [%u bytes] ", 113 sb_add(m, "INCOMPLETE [%u bytes] ",
115 skb->len - iphoff - ih->ihl*4); 114 skb->len - iphoff - ih->ihl*4);
116 break; 115 break;
117 } 116 }
118 117
119 /* Max length: 20 "SPT=65535 DPT=65535 " */ 118 /* Max length: 20 "SPT=65535 DPT=65535 " */
120 printk("SPT=%u DPT=%u ", 119 sb_add(m, "SPT=%u DPT=%u ",
121 ntohs(th->source), ntohs(th->dest)); 120 ntohs(th->source), ntohs(th->dest));
122 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
123 if (logflags & IPT_LOG_TCPSEQ) 122 if (logflags & IPT_LOG_TCPSEQ)
124 printk("SEQ=%u ACK=%u ", 123 sb_add(m, "SEQ=%u ACK=%u ",
125 ntohl(th->seq), ntohl(th->ack_seq)); 124 ntohl(th->seq), ntohl(th->ack_seq));
126 /* Max length: 13 "WINDOW=65535 " */ 125 /* Max length: 13 "WINDOW=65535 " */
127 printk("WINDOW=%u ", ntohs(th->window)); 126 sb_add(m, "WINDOW=%u ", ntohs(th->window));
128 /* Max length: 9 "RES=0x3F " */ 127 /* Max length: 9 "RES=0x3F " */
129 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 128 sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
130 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
131 if (th->cwr) 130 if (th->cwr)
132 printk("CWR "); 131 sb_add(m, "CWR ");
133 if (th->ece) 132 if (th->ece)
134 printk("ECE "); 133 sb_add(m, "ECE ");
135 if (th->urg) 134 if (th->urg)
136 printk("URG "); 135 sb_add(m, "URG ");
137 if (th->ack) 136 if (th->ack)
138 printk("ACK "); 137 sb_add(m, "ACK ");
139 if (th->psh) 138 if (th->psh)
140 printk("PSH "); 139 sb_add(m, "PSH ");
141 if (th->rst) 140 if (th->rst)
142 printk("RST "); 141 sb_add(m, "RST ");
143 if (th->syn) 142 if (th->syn)
144 printk("SYN "); 143 sb_add(m, "SYN ");
145 if (th->fin) 144 if (th->fin)
146 printk("FIN "); 145 sb_add(m, "FIN ");
147 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
148 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
149 148
150 if ((logflags & IPT_LOG_TCPOPT) && 149 if ((logflags & IPT_LOG_TCPOPT) &&
151 th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
158 iphoff+ih->ihl*4+sizeof(_tcph), 157 iphoff+ih->ihl*4+sizeof(_tcph),
159 optsize, _opt); 158 optsize, _opt);
160 if (op == NULL) { 159 if (op == NULL) {
161 printk("TRUNCATED"); 160 sb_add(m, "TRUNCATED");
162 return; 161 return;
163 } 162 }
164 163
165 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
166 printk("OPT ("); 165 sb_add(m, "OPT (");
167 for (i = 0; i < optsize; i++) 166 for (i = 0; i < optsize; i++)
168 printk("%02X", op[i]); 167 sb_add(m, "%02X", op[i]);
169 printk(") "); 168 sb_add(m, ") ");
170 } 169 }
171 break; 170 break;
172 } 171 }
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
177 176
178 if (ih->protocol == IPPROTO_UDP) 177 if (ih->protocol == IPPROTO_UDP)
179 /* Max length: 10 "PROTO=UDP " */ 178 /* Max length: 10 "PROTO=UDP " */
180 printk("PROTO=UDP " ); 179 sb_add(m, "PROTO=UDP " );
181 else /* Max length: 14 "PROTO=UDPLITE " */ 180 else /* Max length: 14 "PROTO=UDPLITE " */
182 printk("PROTO=UDPLITE "); 181 sb_add(m, "PROTO=UDPLITE ");
183 182
184 if (ntohs(ih->frag_off) & IP_OFFSET) 183 if (ntohs(ih->frag_off) & IP_OFFSET)
185 break; 184 break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
188 uh = skb_header_pointer(skb, iphoff+ih->ihl*4, 187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
189 sizeof(_udph), &_udph); 188 sizeof(_udph), &_udph);
190 if (uh == NULL) { 189 if (uh == NULL) {
191 printk("INCOMPLETE [%u bytes] ", 190 sb_add(m, "INCOMPLETE [%u bytes] ",
192 skb->len - iphoff - ih->ihl*4); 191 skb->len - iphoff - ih->ihl*4);
193 break; 192 break;
194 } 193 }
195 194
196 /* Max length: 20 "SPT=65535 DPT=65535 " */ 195 /* Max length: 20 "SPT=65535 DPT=65535 " */
197 printk("SPT=%u DPT=%u LEN=%u ", 196 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
198 ntohs(uh->source), ntohs(uh->dest), 197 ntohs(uh->source), ntohs(uh->dest),
199 ntohs(uh->len)); 198 ntohs(uh->len));
200 break; 199 break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
221 [ICMP_ADDRESSREPLY] = 12 }; 220 [ICMP_ADDRESSREPLY] = 12 };
222 221
223 /* Max length: 11 "PROTO=ICMP " */ 222 /* Max length: 11 "PROTO=ICMP " */
224 printk("PROTO=ICMP "); 223 sb_add(m, "PROTO=ICMP ");
225 224
226 if (ntohs(ih->frag_off) & IP_OFFSET) 225 if (ntohs(ih->frag_off) & IP_OFFSET)
227 break; 226 break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
230 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, 229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
231 sizeof(_icmph), &_icmph); 230 sizeof(_icmph), &_icmph);
232 if (ich == NULL) { 231 if (ich == NULL) {
233 printk("INCOMPLETE [%u bytes] ", 232 sb_add(m, "INCOMPLETE [%u bytes] ",
234 skb->len - iphoff - ih->ihl*4); 233 skb->len - iphoff - ih->ihl*4);
235 break; 234 break;
236 } 235 }
237 236
238 /* Max length: 18 "TYPE=255 CODE=255 " */ 237 /* Max length: 18 "TYPE=255 CODE=255 " */
239 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
240 239
241 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
242 if (ich->type <= NR_ICMP_TYPES && 241 if (ich->type <= NR_ICMP_TYPES &&
243 required_len[ich->type] && 242 required_len[ich->type] &&
244 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
245 printk("INCOMPLETE [%u bytes] ", 244 sb_add(m, "INCOMPLETE [%u bytes] ",
246 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
247 break; 246 break;
248 } 247 }
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
251 case ICMP_ECHOREPLY: 250 case ICMP_ECHOREPLY:
252 case ICMP_ECHO: 251 case ICMP_ECHO:
253 /* Max length: 19 "ID=65535 SEQ=65535 " */ 252 /* Max length: 19 "ID=65535 SEQ=65535 " */
254 printk("ID=%u SEQ=%u ", 253 sb_add(m, "ID=%u SEQ=%u ",
255 ntohs(ich->un.echo.id), 254 ntohs(ich->un.echo.id),
256 ntohs(ich->un.echo.sequence)); 255 ntohs(ich->un.echo.sequence));
257 break; 256 break;
258 257
259 case ICMP_PARAMETERPROB: 258 case ICMP_PARAMETERPROB:
260 /* Max length: 14 "PARAMETER=255 " */ 259 /* Max length: 14 "PARAMETER=255 " */
261 printk("PARAMETER=%u ", 260 sb_add(m, "PARAMETER=%u ",
262 ntohl(ich->un.gateway) >> 24); 261 ntohl(ich->un.gateway) >> 24);
263 break; 262 break;
264 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
265 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
266 printk("GATEWAY=%pI4 ", &ich->un.gateway); 265 sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
267 /* Fall through */ 266 /* Fall through */
268 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
269 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
270 case ICMP_TIME_EXCEEDED: 269 case ICMP_TIME_EXCEEDED:
271 /* Max length: 3+maxlen */ 270 /* Max length: 3+maxlen */
272 if (!iphoff) { /* Only recurse once. */ 271 if (!iphoff) { /* Only recurse once. */
273 printk("["); 272 sb_add(m, "[");
274 dump_packet(info, skb, 273 dump_packet(m, info, skb,
275 iphoff + ih->ihl*4+sizeof(_icmph)); 274 iphoff + ih->ihl*4+sizeof(_icmph));
276 printk("] "); 275 sb_add(m, "] ");
277 } 276 }
278 277
279 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
280 if (ich->type == ICMP_DEST_UNREACH && 279 if (ich->type == ICMP_DEST_UNREACH &&
281 ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
282 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
283 } 282 }
284 break; 283 break;
285 } 284 }
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
292 break; 291 break;
293 292
294 /* Max length: 9 "PROTO=AH " */ 293 /* Max length: 9 "PROTO=AH " */
295 printk("PROTO=AH "); 294 sb_add(m, "PROTO=AH ");
296 295
297 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
298 ah = skb_header_pointer(skb, iphoff+ih->ihl*4, 297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
299 sizeof(_ahdr), &_ahdr); 298 sizeof(_ahdr), &_ahdr);
300 if (ah == NULL) { 299 if (ah == NULL) {
301 printk("INCOMPLETE [%u bytes] ", 300 sb_add(m, "INCOMPLETE [%u bytes] ",
302 skb->len - iphoff - ih->ihl*4); 301 skb->len - iphoff - ih->ihl*4);
303 break; 302 break;
304 } 303 }
305 304
306 /* Length: 15 "SPI=0xF1234567 " */ 305 /* Length: 15 "SPI=0xF1234567 " */
307 printk("SPI=0x%x ", ntohl(ah->spi)); 306 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
308 break; 307 break;
309 } 308 }
310 case IPPROTO_ESP: { 309 case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
312 const struct ip_esp_hdr *eh; 311 const struct ip_esp_hdr *eh;
313 312
314 /* Max length: 10 "PROTO=ESP " */ 313 /* Max length: 10 "PROTO=ESP " */
315 printk("PROTO=ESP "); 314 sb_add(m, "PROTO=ESP ");
316 315
317 if (ntohs(ih->frag_off) & IP_OFFSET) 316 if (ntohs(ih->frag_off) & IP_OFFSET)
318 break; 317 break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
321 eh = skb_header_pointer(skb, iphoff+ih->ihl*4, 320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
322 sizeof(_esph), &_esph); 321 sizeof(_esph), &_esph);
323 if (eh == NULL) { 322 if (eh == NULL) {
324 printk("INCOMPLETE [%u bytes] ", 323 sb_add(m, "INCOMPLETE [%u bytes] ",
325 skb->len - iphoff - ih->ihl*4); 324 skb->len - iphoff - ih->ihl*4);
326 break; 325 break;
327 } 326 }
328 327
329 /* Length: 15 "SPI=0xF1234567 " */ 328 /* Length: 15 "SPI=0xF1234567 " */
330 printk("SPI=0x%x ", ntohl(eh->spi)); 329 sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
331 break; 330 break;
332 } 331 }
333 /* Max length: 10 "PROTO 255 " */ 332 /* Max length: 10 "PROTO 255 " */
334 default: 333 default:
335 printk("PROTO=%u ", ih->protocol); 334 sb_add(m, "PROTO=%u ", ih->protocol);
336 } 335 }
337 336
338 /* Max length: 15 "UID=4294967295 " */ 337 /* Max length: 15 "UID=4294967295 " */
339 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
340 read_lock_bh(&skb->sk->sk_callback_lock); 339 read_lock_bh(&skb->sk->sk_callback_lock);
341 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 340 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
342 printk("UID=%u GID=%u ", 341 sb_add(m, "UID=%u GID=%u ",
343 skb->sk->sk_socket->file->f_cred->fsuid, 342 skb->sk->sk_socket->file->f_cred->fsuid,
344 skb->sk->sk_socket->file->f_cred->fsgid); 343 skb->sk->sk_socket->file->f_cred->fsgid);
345 read_unlock_bh(&skb->sk->sk_callback_lock); 344 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
347 346
348 /* Max length: 16 "MARK=0xFFFFFFFF " */ 347 /* Max length: 16 "MARK=0xFFFFFFFF " */
349 if (!iphoff && skb->mark) 348 if (!iphoff && skb->mark)
350 printk("MARK=0x%x ", skb->mark); 349 sb_add(m, "MARK=0x%x ", skb->mark);
351 350
352 /* Proto Max log string length */ 351 /* Proto Max log string length */
353 /* IP: 40+46+6+11+127 = 230 */ 352 /* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
364 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 363 /* maxlen = 230+ 91 + 230 + 252 = 803 */
365} 364}
366 365
367static void dump_mac_header(const struct nf_loginfo *info, 366static void dump_mac_header(struct sbuff *m,
367 const struct nf_loginfo *info,
368 const struct sk_buff *skb) 368 const struct sk_buff *skb)
369{ 369{
370 struct net_device *dev = skb->dev; 370 struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
378 378
379 switch (dev->type) { 379 switch (dev->type) {
380 case ARPHRD_ETHER: 380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", 381 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto)); 383 ntohs(eth_hdr(skb)->h_proto));
384 return; 384 return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
387 } 387 }
388 388
389fallback: 389fallback:
390 printk("MAC="); 390 sb_add(m, "MAC=");
391 if (dev->hard_header_len && 391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) { 392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb); 393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i; 394 unsigned int i;
395 395
396 printk("%02x", *p++); 396 sb_add(m, "%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++) 397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p); 398 sb_add(m, ":%02x", *p);
399 } 399 }
400 printk(" "); 400 sb_add(m, " ");
401} 401}
402 402
403static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
419 const struct nf_loginfo *loginfo, 419 const struct nf_loginfo *loginfo,
420 const char *prefix) 420 const char *prefix)
421{ 421{
422 struct sbuff *m = sb_open();
423
422 if (!loginfo) 424 if (!loginfo)
423 loginfo = &default_loginfo; 425 loginfo = &default_loginfo;
424 426
425 spin_lock_bh(&log_lock); 427 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
426 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
427 prefix, 428 prefix,
428 in ? in->name : "", 429 in ? in->name : "",
429 out ? out->name : ""); 430 out ? out->name : "");
@@ -434,20 +435,19 @@ ipt_log_packet(u_int8_t pf,
434 435
435 physindev = skb->nf_bridge->physindev; 436 physindev = skb->nf_bridge->physindev;
436 if (physindev && in != physindev) 437 if (physindev && in != physindev)
437 printk("PHYSIN=%s ", physindev->name); 438 sb_add(m, "PHYSIN=%s ", physindev->name);
438 physoutdev = skb->nf_bridge->physoutdev; 439 physoutdev = skb->nf_bridge->physoutdev;
439 if (physoutdev && out != physoutdev) 440 if (physoutdev && out != physoutdev)
440 printk("PHYSOUT=%s ", physoutdev->name); 441 sb_add(m, "PHYSOUT=%s ", physoutdev->name);
441 } 442 }
442#endif 443#endif
443 444
444 /* MAC logging for input path only. */ 445 if (in != NULL)
445 if (in && !out) 446 dump_mac_header(m, loginfo, skb);
446 dump_mac_header(loginfo, skb); 447
448 dump_packet(m, loginfo, skb, 0);
447 449
448 dump_packet(loginfo, skb, 0); 450 sb_close(m);
449 printk("\n");
450 spin_unlock_bh(&log_lock);
451} 451}
452 452
453static unsigned int 453static unsigned int
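
ipt_LOG no longer serializes dozens of printk() calls under a global spinlock: each log invocation opens an sbuff, appends every field with sb_add(), and sb_close() emits the finished line in one piece, so CPUs can format concurrently and lines from different packets cannot interleave. A hedged userspace sketch of that append-then-flush buffer (sb_reset/sb_append/sb_flush are stand-ins for the xt_log.h helpers, not their real signatures):

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

struct sbuf { char data[1024]; size_t used; };

static void sb_reset(struct sbuf *m) { m->used = 0; m->data[0] = '\0'; }

/* Append a formatted field; truncate rather than overflow. */
static void sb_append(struct sbuf *m, const char *fmt, ...)
{
	size_t room = sizeof(m->data) - m->used;
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(m->data + m->used, room, fmt, ap);
	va_end(ap);
	if (n > 0)
		m->used += (size_t)n < room ? (size_t)n : room - 1;
}

/* Emit the whole line at once; no other output can interleave mid-line. */
static void sb_flush(struct sbuf *m) { fputs(m->data, stdout); sb_reset(m); }

int main(void)
{
	struct sbuf m;

	sb_reset(&m);
	sb_append(&m, "IN=%s OUT=%s ", "eth0", "");
	sb_append(&m, "SRC=%s DST=%s ", "192.0.2.1", "198.51.100.7");
	sb_append(&m, "PROTO=TCP SPT=%u DPT=%u\n", 34567, 80);
	sb_flush(&m);
	return 0;
}
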
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index d2ed9dc74ebc..9931152a78b5 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
60 nat = nfct_nat(ct); 60 nat = nfct_nat(ct);
61 61
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
63 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 63 ctinfo == IP_CT_RELATED_REPLY));
64 64
65 /* Source address is 0.0.0.0 - locally generated packet that is 65 /* Source address is 0.0.0.0 - locally generated packet that is
66 * probably not supposed to be masqueraded. 66 * probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 43eec80c0e7c..51f13f8ec724 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
40 struct iphdr *niph; 40 struct iphdr *niph;
41 const struct tcphdr *oth; 41 const struct tcphdr *oth;
42 struct tcphdr _otcph, *tcph; 42 struct tcphdr _otcph, *tcph;
43 unsigned int addr_type;
44 43
45 /* IP header checks: fragment. */ 44 /* IP header checks: fragment. */
46 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) 45 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
@@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook)
55 if (oth->rst) 54 if (oth->rst)
56 return; 55 return;
57 56
57 if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
58 return;
59
58 /* Check checksum */ 60 /* Check checksum */
59 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) 61 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
60 return; 62 return;
@@ -101,22 +103,14 @@ static void send_reset(struct sk_buff *oldskb, int hook)
101 nskb->csum_start = (unsigned char *)tcph - nskb->head; 103 nskb->csum_start = (unsigned char *)tcph - nskb->head;
102 nskb->csum_offset = offsetof(struct tcphdr, check); 104 nskb->csum_offset = offsetof(struct tcphdr, check);
103 105
104 addr_type = RTN_UNSPEC;
105 if (hook != NF_INET_FORWARD
106#ifdef CONFIG_BRIDGE_NETFILTER
107 || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
108#endif
109 )
110 addr_type = RTN_LOCAL;
111
112 /* ip_route_me_harder expects skb->dst to be set */ 106 /* ip_route_me_harder expects skb->dst to be set */
113 skb_dst_set_noref(nskb, skb_dst(oldskb)); 107 skb_dst_set_noref(nskb, skb_dst(oldskb));
114 108
115 nskb->protocol = htons(ETH_P_IP); 109 nskb->protocol = htons(ETH_P_IP);
116 if (ip_route_me_harder(nskb, addr_type)) 110 if (ip_route_me_harder(nskb, RTN_UNSPEC))
117 goto free_nskb; 111 goto free_nskb;
118 112
119 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 113 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
120 114
121 /* "Never happens" */ 115 /* "Never happens" */
122 if (nskb->len > dst_mtu(skb_dst(nskb))) 116 if (nskb->len > dst_mtu(skb_dst(nskb)))
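
send_reset() now bails out early when the offending packet was routed as broadcast or multicast (the skb_rtable() flags test), so a REJECT --reject-with tcp-reset rule cannot be tricked into answering broadcast traffic, and the old hook-dependent addr_type selection collapses into a plain ip_route_me_harder(..., RTN_UNSPEC). A tiny sketch of the address-class decision the flag test corresponds to; this is a simplification, since the kernel reads the routing entry's flags rather than re-classifying the address, and subnet-directed broadcasts are only caught by the route:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>

/* Suppress a TCP reset when the destination was not a unicast address. */
static bool suppress_reset(const char *dst)
{
	struct in_addr a;

	if (inet_pton(AF_INET, dst, &a) != 1)
		return true;                        /* unparsable: be conservative */
	if (IN_MULTICAST(ntohl(a.s_addr)))          /* 224.0.0.0/4 */
		return true;
	if (ntohl(a.s_addr) == INADDR_BROADCAST)    /* 255.255.255.255 */
		return true;
	return false;
}

int main(void)
{
	printf("10.0.0.1        -> %s\n", suppress_reset("10.0.0.1") ? "skip" : "reset");
	printf("224.0.0.251     -> %s\n", suppress_reset("224.0.0.251") ? "skip" : "reset");
	printf("255.255.255.255 -> %s\n", suppress_reset("255.255.255.255") ? "skip" : "reset");
	return 0;
}
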
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86d..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/ip.h>
17#include <net/route.h>
18
19#include <linux/netfilter_ipv4/ipt_addrtype.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25
26static inline bool match_type(struct net *net, const struct net_device *dev,
27 __be32 addr, u_int16_t mask)
28{
29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30}
31
32static bool
33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo;
37 const struct iphdr *iph = ip_hdr(skb);
38 bool ret = true;
39
40 if (info->source)
41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
42 info->invert_source;
43 if (info->dest)
44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
45 info->invert_dest;
46
47 return ret;
48}
49
50static bool
51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
55 const struct iphdr *iph = ip_hdr(skb);
56 const struct net_device *dev = NULL;
57 bool ret = true;
58
59 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
60 dev = par->in;
61 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
62 dev = par->out;
63
64 if (info->source)
65 ret &= match_type(net, dev, iph->saddr, info->source) ^
66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
67 if (ret && info->dest)
68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
70 return ret;
71}
72
73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n");
81 return -EINVAL;
82 }
83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 pr_info("output interface limitation "
88 "not valid in PREROUTING and INPUT\n");
89 return -EINVAL;
90 }
91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 pr_info("input interface limitation "
96 "not valid in POSTROUTING and OUTPUT\n");
97 return -EINVAL;
98 }
99
100 return 0;
101}
102
103static struct xt_match addrtype_mt_reg[] __read_mostly = {
104 {
105 .name = "addrtype",
106 .family = NFPROTO_IPV4,
107 .match = addrtype_mt_v0,
108 .matchsize = sizeof(struct ipt_addrtype_info),
109 .me = THIS_MODULE
110 },
111 {
112 .name = "addrtype",
113 .family = NFPROTO_IPV4,
114 .revision = 1,
115 .match = addrtype_mt_v1,
116 .checkentry = addrtype_mt_checkentry_v1,
117 .matchsize = sizeof(struct ipt_addrtype_info_v1),
118 .me = THIS_MODULE
119 }
120};
121
122static int __init addrtype_mt_init(void)
123{
124 return xt_register_matches(addrtype_mt_reg,
125 ARRAY_SIZE(addrtype_mt_reg));
126}
127
128static void __exit addrtype_mt_exit(void)
129{
130 xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
131}
132
133module_init(addrtype_mt_init);
134module_exit(addrtype_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index af6e9c778345..2b57e52c746c 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -25,7 +25,8 @@ MODULE_LICENSE("GPL");
25static inline bool match_ip(const struct sk_buff *skb, 25static inline bool match_ip(const struct sk_buff *skb,
26 const struct ipt_ecn_info *einfo) 26 const struct ipt_ecn_info *einfo)
27{ 27{
28 return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; 28 return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^
29 !!(einfo->invert & IPT_ECN_OP_MATCH_IP);
29} 30}
30 31
31static inline bool match_tcp(const struct sk_buff *skb, 32static inline bool match_tcp(const struct sk_buff *skb,
@@ -76,8 +77,6 @@ static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
76 return false; 77 return false;
77 78
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 79 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
80 return false;
81 if (!match_tcp(skb, info, &par->hotdrop)) 80 if (!match_tcp(skb, info, &par->hotdrop))
82 return false; 81 return false;
83 } 82 }
@@ -97,7 +96,7 @@ static int ecn_mt_check(const struct xt_mtchk_param *par)
97 return -EINVAL; 96 return -EINVAL;
98 97
99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && 98 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
100 ip->proto != IPPROTO_TCP) { 99 (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) {
101 pr_info("cannot match TCP bits in rule for non-tcp packets\n"); 100 pr_info("cannot match TCP bits in rule for non-tcp packets\n");
102 return -EINVAL; 101 return -EINVAL;
103 } 102 }
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
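
The mangle output hook above snapshots the routing-relevant packet fields before running the table and, for any verdict other than drop or steal, reroutes the skb when one of them changed. A hedged userspace sketch of the compare step; the field set and names here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

struct route_key {
        uint32_t saddr;
        uint32_t daddr;
        uint32_t mark;
        uint8_t  tos;
};

/* Reroute only when a field that feeds route selection actually changed. */
static bool needs_reroute(const struct route_key *before,
                          const struct route_key *after)
{
        return before->saddr != after->saddr ||
               before->daddr != after->daddr ||
               before->mark  != after->mark  ||
               before->tos   != after->tos;
}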
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a03c02af999..de9da21113a1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
101 101
102 /* This is where we call the helper: as the packet goes out. */ 102 /* This is where we call the helper: as the packet goes out. */
103 ct = nf_ct_get(skb, &ctinfo); 103 ct = nf_ct_get(skb, &ctinfo);
104 if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) 104 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
105 goto out; 105 goto out;
106 106
107 help = nfct_help(ct); 107 help = nfct_help(ct);
@@ -121,7 +121,9 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
121 return ret; 121 return ret;
122 } 122 }
123 123
124 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { 124 /* adjust seqs for loopback traffic only in outgoing direction */
125 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
126 !nf_is_loopback_packet(skb)) {
125 typeof(nf_nat_seq_adjust_hook) seq_adjust; 127 typeof(nf_nat_seq_adjust_hook) seq_adjust;
126 128
127 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 129 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -19,6 +20,7 @@
19#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
20#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
21#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
22 24
23struct ct_iter_state { 25struct ct_iter_state {
24 struct seq_net_private p; 26 struct seq_net_private p;
@@ -34,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
34 for (st->bucket = 0; 36 for (st->bucket = 0;
35 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
36 st->bucket++) { 38 st->bucket++) {
37 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
38 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
39 return n; 42 return n;
40 } 43 }
@@ -47,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
47 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
48 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
49 52
50 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
51 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
52 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
53 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
54 return NULL; 57 return NULL;
55 } 58 }
56 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
57 } 61 }
58 return head; 62 return head;
59} 63}
@@ -87,6 +91,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 91 rcu_read_unlock();
88} 92}
89 93
94#ifdef CONFIG_NF_CONNTRACK_SECMARK
95static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
96{
97 int ret;
98 u32 len;
99 char *secctx;
100
101 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
102 if (ret)
103 return 0;
104
105 ret = seq_printf(s, "secctx=%s ", secctx);
106
107 security_release_secctx(secctx, len);
108 return ret;
109}
110#else
111static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
112{
113 return 0;
114}
115#endif
116
90static int ct_seq_show(struct seq_file *s, void *v) 117static int ct_seq_show(struct seq_file *s, void *v)
91{ 118{
92 struct nf_conntrack_tuple_hash *hash = v; 119 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +175,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 175 goto release;
149#endif 176#endif
150 177
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 178 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 179 goto release;
154#endif
155 180
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 181 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 182 goto release;
@@ -195,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
195 struct hlist_node *n; 220 struct hlist_node *n;
196 221
197 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
198 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
199 if (n) 225 if (n)
200 return n; 226 return n;
201 } 227 }
@@ -208,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
208 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
209 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
210 236
211 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
212 while (head == NULL) { 238 while (head == NULL) {
213 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
214 return NULL; 240 return NULL;
215 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
216 } 243 }
217 return head; 244 return head;
218} 245}
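
The conntrack /proc walkers above switch to the hlist_nulls_first_rcu()/hlist_nulls_next_rcu() accessors. A nulls list ends in an encoded marker rather than NULL, so a lockless RCU reader that falls off a chain can tell which bucket the terminator belongs to and restart if the entry it was following got rehashed mid-walk. A small self-contained sketch of that marker encoding (helper names are made up; the kernel's real ones live in include/linux/list_nulls.h):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Terminator pointers carry the bucket number shifted left, low bit set. */
static uintptr_t make_nulls_marker(unsigned long bucket) { return (bucket << 1) | 1UL; }
static bool is_nulls_marker(uintptr_t p)                 { return p & 1UL; }
static unsigned long nulls_marker_value(uintptr_t p)     { return p >> 1; }

int main(void)
{
        uintptr_t end = make_nulls_marker(42);

        /* A reader reaching a marker for a different bucket than the one it
         * started in knows the chain changed under it and must restart. */
        printf("%d %lu\n", is_nulls_marker(end), nulls_marker_value(end));
        return 0;   /* prints "1 42" */
}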
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7404bde95994..ab5b27a2916f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
160 /* Update skb to refer to this connection */ 160 /* Update skb to refer to this connection */
161 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 161 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
162 skb->nfctinfo = *ctinfo; 162 skb->nfctinfo = *ctinfo;
163 return -NF_ACCEPT; 163 return NF_ACCEPT;
164} 164}
165 165
166/* Small and modified version of icmp_rcv */ 166/* Small and modified version of icmp_rcv */
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int res;
48
47 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
48 if (nf_ct_expect_related(exp) == 0) 50 res = nf_ct_expect_related(exp);
51 if (res == 0)
52 break;
53 else if (res != -EBUSY) {
54 port = 0;
49 break; 55 break;
56 }
50 } 57 }
51 58
52 if (port == 0) 59 if (port == 0)
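
The port-search loop above (and the matching changes in the FTP, IRC and SIP helpers further down) now distinguishes "this port is already expected" (-EBUSY, keep scanning) from any other failure (give up at once, leaving port == 0 so the helper drops the packet). A self-contained userspace sketch of the pattern, with a toy stand-in for nf_ct_expect_related():

#include <errno.h>
#include <stdio.h>

/* Toy stand-in: pretend everything below 1024 is already taken. */
static int try_register(unsigned short port)
{
        return port < 1024 ? -EBUSY : 0;
}

static unsigned short pick_port(unsigned short wanted)
{
        unsigned short port;

        for (port = wanted; port != 0; port++) {
                int ret = try_register(port);

                if (ret == 0)
                        return port;        /* reserved */
                if (ret != -EBUSY)
                        return 0;           /* hard error: stop scanning */
        }
        return 0;                           /* wrapped around: nothing free */
}

int main(void)
{
        printf("%u\n", pick_port(1000));    /* prints "1024" */
        return 0;
}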
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..3346de5d94d0 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
38static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
39 39
40#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
42 __read_mostly; 42 __read_mostly;
43 43
44static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
@@ -47,28 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum)
47 return rcu_dereference(nf_nat_protos[protonum]); 47 return rcu_dereference(nf_nat_protos[protonum]);
48} 48}
49 49
50const struct nf_nat_protocol *
51nf_nat_proto_find_get(u_int8_t protonum)
52{
53 const struct nf_nat_protocol *p;
54
55 rcu_read_lock();
56 p = __nf_nat_proto_find(protonum);
57 if (!try_module_get(p->me))
58 p = &nf_nat_unknown_protocol;
59 rcu_read_unlock();
60
61 return p;
62}
63EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
64
65void
66nf_nat_proto_put(const struct nf_nat_protocol *p)
67{
68 module_put(p->me);
69}
70EXPORT_SYMBOL_GPL(nf_nat_proto_put);
71
72/* We keep an extra hash for each conntrack, for fast searching. */ 50/* We keep an extra hash for each conntrack, for fast searching. */
73static inline unsigned int 51static inline unsigned int
74hash_by_src(const struct net *net, u16 zone, 52hash_by_src(const struct net *net, u16 zone,
@@ -243,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
243 manips not an issue. */ 221 manips not an issue. */
244 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
245 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
246 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
247 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
248 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
249 return; 234 return;
@@ -262,11 +247,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
262 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 247 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
263 248
264 /* Only bother mapping if it's not already in range and unique */ 249 /* Only bother mapping if it's not already in range and unique */
265 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && 250 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
266 (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 251 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
267 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 252 if (proto->in_range(tuple, maniptype, &range->min,
268 !nf_nat_used_tuple(tuple, ct)) 253 &range->max) &&
269 goto out; 254 (range->min.all == range->max.all ||
255 !nf_nat_used_tuple(tuple, ct)))
256 goto out;
257 } else if (!nf_nat_used_tuple(tuple, ct)) {
258 goto out;
259 }
260 }
270 261
271 /* Last change: get protocol to try to obtain unique tuple. */ 262 /* Last change: get protocol to try to obtain unique tuple. */
272 proto->unique_tuple(tuple, range, maniptype, ct); 263 proto->unique_tuple(tuple, range, maniptype, ct);
@@ -282,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
282 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
283 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
284 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
285 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
286 276
287 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
288 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -322,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
322 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
323 } 313 }
324 314
325 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
326 if (have_to_hash) {
327 unsigned int srchash; 316 unsigned int srchash;
328 317
329 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -339,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
339 328
340 /* It's done. */ 329 /* It's done. */
341 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
342 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
343 else 332 else
344 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
345 334
346 return NF_ACCEPT; 335 return NF_ACCEPT;
347} 336}
@@ -444,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
444 433
445 /* Must be RELATED */ 434 /* Must be RELATED */
446 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || 435 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
447 skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); 436 skb->nfctinfo == IP_CT_RELATED_REPLY);
448 437
449 /* Redirects on non-null nats must be dropped, else they'll 438 /* Redirects on non-null nats must be dropped, else they'll
450 start talking to each other without our translation, and be 439 start talking to each other without our translation, and be
@@ -458,6 +447,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
458 return 0; 447 return 0;
459 } 448 }
460 449
450 if (manip == IP_NAT_MANIP_SRC)
451 statusbit = IPS_SRC_NAT;
452 else
453 statusbit = IPS_DST_NAT;
454
455 /* Invert if this is reply dir. */
456 if (dir == IP_CT_DIR_REPLY)
457 statusbit ^= IPS_NAT_MASK;
458
459 if (!(ct->status & statusbit))
460 return 1;
461
461 pr_debug("icmp_reply_translation: translating error %p manip %u " 462 pr_debug("icmp_reply_translation: translating error %p manip %u "
462 "dir %s\n", skb, manip, 463 "dir %s\n", skb, manip,
463 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 464 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +493,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
492 493
493 /* Change outer to look the reply to an incoming packet 494 /* Change outer to look the reply to an incoming packet
494 * (proto 0 means don't invert per-proto part). */ 495 * (proto 0 means don't invert per-proto part). */
495 if (manip == IP_NAT_MANIP_SRC) 496 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
496 statusbit = IPS_SRC_NAT; 497 if (!manip_pkt(0, skb, 0, &target, manip))
497 else 498 return 0;
498 statusbit = IPS_DST_NAT;
499
500 /* Invert if this is reply dir. */
501 if (dir == IP_CT_DIR_REPLY)
502 statusbit ^= IPS_NAT_MASK;
503
504 if (ct->status & statusbit) {
505 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506 if (!manip_pkt(0, skb, 0, &target, manip))
507 return 0;
508 }
509 499
510 return 1; 500 return 1;
511} 501}
@@ -517,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
517 int ret = 0; 507 int ret = 0;
518 508
519 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
520 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
521 ret = -EBUSY; 514 ret = -EBUSY;
522 goto out; 515 goto out;
523 } 516 }
@@ -528,7 +521,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
528} 521}
529EXPORT_SYMBOL(nf_nat_protocol_register); 522EXPORT_SYMBOL(nf_nat_protocol_register);
530 523
531/* Noone stores the protocol anywhere; simply delete it. */ 524/* No one stores the protocol anywhere; simply delete it. */
532void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) 525void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
533{ 526{
534 spin_lock_bh(&nf_nat_lock); 527 spin_lock_bh(&nf_nat_lock);
@@ -539,7 +532,7 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
539} 532}
540EXPORT_SYMBOL(nf_nat_protocol_unregister); 533EXPORT_SYMBOL(nf_nat_protocol_unregister);
541 534
542/* Noone using conntrack by the time this called. */ 535/* No one using conntrack by the time this called. */
543static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 536static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
544{ 537{
545 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); 538 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
@@ -547,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
547 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
548 return; 541 return;
549 542
550 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
551 544
552 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
553 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -560,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
560 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
561 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
562 555
563 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
564 return; 557 return;
565 558
566 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
567 new_nat->ct = ct;
568 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
569 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
570} 562}
@@ -583,6 +575,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
583#include <linux/netfilter/nfnetlink.h> 575#include <linux/netfilter/nfnetlink.h>
584#include <linux/netfilter/nfnetlink_conntrack.h> 576#include <linux/netfilter/nfnetlink_conntrack.h>
585 577
578static const struct nf_nat_protocol *
579nf_nat_proto_find_get(u_int8_t protonum)
580{
581 const struct nf_nat_protocol *p;
582
583 rcu_read_lock();
584 p = __nf_nat_proto_find(protonum);
585 if (!try_module_get(p->me))
586 p = &nf_nat_unknown_protocol;
587 rcu_read_unlock();
588
589 return p;
590}
591
592static void
593nf_nat_proto_put(const struct nf_nat_protocol *p)
594{
595 module_put(p->me);
596}
597
586static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 598static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
587 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 599 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
588 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 600 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
@@ -674,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
674{ 686{
675 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
676 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
677 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
678 &net->ipv4.nat_vmalloced, 0);
679 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
680 return -ENOMEM; 691 return -ENOMEM;
681 return 0; 692 return 0;
@@ -697,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
697{ 708{
698 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
699 synchronize_rcu(); 710 synchronize_rcu();
700 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
701 net->ipv4.nat_htable_size);
702} 712}
703 713
704static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
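
With the protocol array now annotated __rcu, the registration path above reads the slot under nf_nat_lock via rcu_dereference_protected() with a lockdep expression, so both sparse and lockdep can check the access. A kernel-style sketch of that update-side idiom; the struct and function names are hypothetical, the RCU and locking calls are the real APIs:

static DEFINE_SPINLOCK(slot_lock);
static const struct my_proto __rcu *slot;          /* hypothetical payload type */

static int slot_register(const struct my_proto *p)
{
        int ret = 0;

        spin_lock_bh(&slot_lock);
        /* Writer-side read: safe because we hold the lock, and we tell
         * lockdep exactly which lock justifies the plain dereference. */
        if (rcu_dereference_protected(slot, lockdep_is_held(&slot_lock)))
                ret = -EBUSY;
        else
                rcu_assign_pointer(slot, p);        /* publish to RCU readers */
        spin_unlock_bh(&slot_lock);
        return ret;
}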
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
79 79
80 /* Try to get same port: if not, try to change it. */ 80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
82 exp->tuple.dst.u.tcp.port = htons(port); 84 exp->tuple.dst.u.tcp.port = htons(port);
83 if (nf_ct_expect_related(exp) == 0) 85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
84 break; 90 break;
91 }
85 } 92 }
86 93
87 if (port == 0) 94 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
222 /* Try to get a pair of ports. */ 222 /* Try to get a pair of ports. */
223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); 223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
224 nated_port != 0; nated_port += 2) { 224 nated_port != 0; nated_port += 2) {
225 int ret;
226
225 rtp_exp->tuple.dst.u.udp.port = htons(nated_port); 227 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
226 if (nf_ct_expect_related(rtp_exp) == 0) { 228 ret = nf_ct_expect_related(rtp_exp);
229 if (ret == 0) {
227 rtcp_exp->tuple.dst.u.udp.port = 230 rtcp_exp->tuple.dst.u.udp.port =
228 htons(nated_port + 1); 231 htons(nated_port + 1);
229 if (nf_ct_expect_related(rtcp_exp) == 0) 232 ret = nf_ct_expect_related(rtcp_exp);
233 if (ret == 0)
234 break;
235 else if (ret != -EBUSY) {
236 nf_ct_unexpect_related(rtp_exp);
237 nated_port = 0;
230 break; 238 break;
231 nf_ct_unexpect_related(rtp_exp); 239 }
240 } else if (ret != -EBUSY) {
241 nated_port = 0;
242 break;
232 } 243 }
233 } 244 }
234 245
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
284 295
285 /* Try to get same port: if not, try to change it. */ 296 /* Try to get same port: if not, try to change it. */
286 for (; nated_port != 0; nated_port++) { 297 for (; nated_port != 0; nated_port++) {
298 int ret;
299
287 exp->tuple.dst.u.tcp.port = htons(nated_port); 300 exp->tuple.dst.u.tcp.port = htons(nated_port);
288 if (nf_ct_expect_related(exp) == 0) 301 ret = nf_ct_expect_related(exp);
302 if (ret == 0)
303 break;
304 else if (ret != -EBUSY) {
305 nated_port = 0;
289 break; 306 break;
307 }
290 } 308 }
291 309
292 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
334 352
335 /* Try to get same port: if not, try to change it. */ 353 /* Try to get same port: if not, try to change it. */
336 for (; nated_port != 0; nated_port++) { 354 for (; nated_port != 0; nated_port++) {
355 int ret;
356
337 exp->tuple.dst.u.tcp.port = htons(nated_port); 357 exp->tuple.dst.u.tcp.port = htons(nated_port);
338 if (nf_ct_expect_related(exp) == 0) 358 ret = nf_ct_expect_related(exp);
359 if (ret == 0)
339 break; 360 break;
361 else if (ret != -EBUSY) {
362 nated_port = 0;
363 break;
364 }
340 } 365 }
341 366
342 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 443
419 /* Try to get same port: if not, try to change it. */ 444 /* Try to get same port: if not, try to change it. */
420 for (; nated_port != 0; nated_port++) { 445 for (; nated_port != 0; nated_port++) {
446 int ret;
447
421 exp->tuple.dst.u.tcp.port = htons(nated_port); 448 exp->tuple.dst.u.tcp.port = htons(nated_port);
422 if (nf_ct_expect_related(exp) == 0) 449 ret = nf_ct_expect_related(exp);
450 if (ret == 0)
451 break;
452 else if (ret != -EBUSY) {
453 nated_port = 0;
423 break; 454 break;
455 }
424 } 456 }
425 457
426 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
500 532
501 /* Try to get same port: if not, try to change it. */ 533 /* Try to get same port: if not, try to change it. */
502 for (nated_port = ntohs(port); nated_port != 0; nated_port++) { 534 for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
535 int ret;
536
503 exp->tuple.dst.u.tcp.port = htons(nated_port); 537 exp->tuple.dst.u.tcp.port = htons(nated_port);
504 if (nf_ct_expect_related(exp) == 0) 538 ret = nf_ct_expect_related(exp);
539 if (ret == 0)
505 break; 540 break;
541 else if (ret != -EBUSY) {
542 nated_port = 0;
543 break;
544 }
506 } 545 }
507 546
508 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
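
nat_rtp_rtcp() above reserves an even/odd RTP/RTCP port pair and has to unwind the RTP expectation when the RTCP one cannot be set up. A generic userspace sketch of that reserve-pair-or-roll-back shape, with toy reserve()/unreserve() helpers; this is not a line-for-line copy of the kernel loop:

#include <errno.h>

static int  reserve(unsigned short port)   { return port < 2000 ? -EBUSY : 0; }  /* toy stub */
static void unreserve(unsigned short port) { (void)port; }

static unsigned short pick_rtp_pair(unsigned short wanted)
{
        unsigned short port;

        for (port = wanted; port != 0; port += 2) {
                int ret = reserve(port);            /* RTP, the even port */

                if (ret == -EBUSY)
                        continue;                   /* pair taken, try the next one */
                if (ret < 0)
                        return 0;                   /* hard error */

                ret = reserve(port + 1);            /* RTCP, the odd port above it */
                if (ret == 0)
                        return port;                /* both sides reserved */

                unreserve(port);                    /* roll back the RTP side */
                if (ret != -EBUSY)
                        return 0;
        }
        return 0;
}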
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..ebc5f8894f99 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen)
158{
159 struct rtable *rt = skb_rtable(skb);
160
161 if (skb->ip_summed != CHECKSUM_PARTIAL) {
162 if (!(rt->rt_flags & RTCF_LOCAL) &&
163 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
164 skb->ip_summed = CHECKSUM_PARTIAL;
165 skb->csum_start = skb_headroom(skb) +
166 skb_network_offset(skb) +
167 iph->ihl * 4;
168 skb->csum_offset = (void *)check - data;
169 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
170 datalen, iph->protocol, 0);
171 } else {
172 *check = 0;
173 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
174 datalen, iph->protocol,
175 csum_partial(data, datalen,
176 0));
177 if (iph->protocol == IPPROTO_UDP && !*check)
178 *check = CSUM_MANGLED_0;
179 }
180 } else
181 inet_proto_csum_replace2(check, skb,
182 htons(oldlen), htons(datalen), 1);
183}
184
156/* Generic function for mangling variable-length address changes inside 185/* Generic function for mangling variable-length address changes inside
157 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 186 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
158 * command in FTP). 187 * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
169 const char *rep_buffer, 198 const char *rep_buffer,
170 unsigned int rep_len, bool adjust) 199 unsigned int rep_len, bool adjust)
171{ 200{
172 struct rtable *rt = skb_rtable(skb);
173 struct iphdr *iph; 201 struct iphdr *iph;
174 struct tcphdr *tcph; 202 struct tcphdr *tcph;
175 int oldlen, datalen; 203 int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
192 match_offset, match_len, rep_buffer, rep_len); 220 match_offset, match_len, rep_buffer, rep_len);
193 221
194 datalen = skb->len - iph->ihl*4; 222 datalen = skb->len - iph->ihl*4;
195 if (skb->ip_summed != CHECKSUM_PARTIAL) { 223 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
196 if (!(rt->rt_flags & RTCF_LOCAL) &&
197 skb->dev->features & NETIF_F_V4_CSUM) {
198 skb->ip_summed = CHECKSUM_PARTIAL;
199 skb->csum_start = skb_headroom(skb) +
200 skb_network_offset(skb) +
201 iph->ihl * 4;
202 skb->csum_offset = offsetof(struct tcphdr, check);
203 tcph->check = ~tcp_v4_check(datalen,
204 iph->saddr, iph->daddr, 0);
205 } else {
206 tcph->check = 0;
207 tcph->check = tcp_v4_check(datalen,
208 iph->saddr, iph->daddr,
209 csum_partial(tcph,
210 datalen, 0));
211 }
212 } else
213 inet_proto_csum_replace2(&tcph->check, skb,
214 htons(oldlen), htons(datalen), 1);
215 224
216 if (adjust && rep_len != match_len) 225 if (adjust && rep_len != match_len)
217 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, 226 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
240 const char *rep_buffer, 249 const char *rep_buffer,
241 unsigned int rep_len) 250 unsigned int rep_len)
242{ 251{
243 struct rtable *rt = skb_rtable(skb);
244 struct iphdr *iph; 252 struct iphdr *iph;
245 struct udphdr *udph; 253 struct udphdr *udph;
246 int datalen, oldlen; 254 int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
274 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) 282 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
275 return 1; 283 return 1;
276 284
277 if (skb->ip_summed != CHECKSUM_PARTIAL) { 285 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
278 if (!(rt->rt_flags & RTCF_LOCAL) &&
279 skb->dev->features & NETIF_F_V4_CSUM) {
280 skb->ip_summed = CHECKSUM_PARTIAL;
281 skb->csum_start = skb_headroom(skb) +
282 skb_network_offset(skb) +
283 iph->ihl * 4;
284 skb->csum_offset = offsetof(struct udphdr, check);
285 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
286 datalen, IPPROTO_UDP,
287 0);
288 } else {
289 udph->check = 0;
290 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
291 datalen, IPPROTO_UDP,
292 csum_partial(udph,
293 datalen, 0));
294 if (!udph->check)
295 udph->check = CSUM_MANGLED_0;
296 }
297 } else
298 inet_proto_csum_replace2(&udph->check, skb,
299 htons(oldlen), htons(datalen), 1);
300 286
301 return 1; 287 return 1;
302} 288}
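
The new nf_nat_csum() helper above centralizes the three cases the TCP and UDP manglers previously open-coded: leave an already-offloaded CHECKSUM_PARTIAL skb to inet_proto_csum_replace2(), set up partial checksumming when the device can finish it, or fold the full pseudo-header sum in software (remapping 0 to 0xffff for UDP). A tiny userspace illustration of the final 16-bit ones'-complement fold that csum_fold() applies to the 32-bit running sum:

#include <stdint.h>
#include <stdio.h>

static uint16_t fold32(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);   /* add the carry words back in */
        sum = (sum & 0xffff) + (sum >> 16);   /* once more for the new carry */
        return (uint16_t)~sum;                /* ones'-complement of the fold */
}

int main(void)
{
        /* 0x0001fffe folds to 0xffff, so the checksum comes out 0x0000 --
         * the value a UDP sender must transmit as 0xffff (CSUM_MANGLED_0). */
        printf("0x%04x\n", fold32(0x0001fffeu));
        return 0;
}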
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
45 45
46 /* Try to get same port: if not, try to change it. */ 46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
48 exp->tuple.dst.u.tcp.port = htons(port); 50 exp->tuple.dst.u.tcp.port = htons(port);
49 if (nf_ct_expect_related(exp) == 0) 51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
50 break; 56 break;
57 }
51 } 58 }
52 59
53 if (port == 0) 60 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f5..733c9abc1cbd 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
53 53
54 /* Connection must be valid and new. */ 54 /* Connection must be valid and new. */
55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
56 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL); 57 NF_CT_ASSERT(par->out != NULL);
58 58
59 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); 59 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{ 106{
107 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
109 Use reply in case it's already been mangled (eg local packet).
110 */ 109 */
111 __be32 ip 110 struct nf_nat_range range;
112 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC 111
113 ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip 112 range.flags = 0;
114 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); 113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
115 struct nf_nat_range range 114 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
116 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; 115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
117 116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
118 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); 117
119 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); 118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
120} 119}
121 120
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
307 exp->expectfn = ip_nat_sip_expected; 307 exp->expectfn = ip_nat_sip_expected;
308 308
309 for (; port != 0; port++) { 309 for (; port != 0; port++) {
310 int ret;
311
310 exp->tuple.dst.u.udp.port = htons(port); 312 exp->tuple.dst.u.udp.port = htons(port);
311 if (nf_ct_expect_related(exp) == 0) 313 ret = nf_ct_expect_related(exp);
314 if (ret == 0)
315 break;
316 else if (ret != -EBUSY) {
317 port = 0;
312 break; 318 break;
319 }
313 } 320 }
314 321
315 if (port == 0) 322 if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
480 /* Try to get same pair of ports: if not, try to change them. */ 487 /* Try to get same pair of ports: if not, try to change them. */
481 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); 488 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
482 port != 0; port += 2) { 489 port != 0; port += 2) {
490 int ret;
491
483 rtp_exp->tuple.dst.u.udp.port = htons(port); 492 rtp_exp->tuple.dst.u.udp.port = htons(port);
484 if (nf_ct_expect_related(rtp_exp) != 0) 493 ret = nf_ct_expect_related(rtp_exp);
494 if (ret == -EBUSY)
485 continue; 495 continue;
496 else if (ret < 0) {
497 port = 0;
498 break;
499 }
486 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); 500 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
487 if (nf_ct_expect_related(rtcp_exp) == 0) 501 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0)
488 break; 503 break;
489 nf_ct_unexpect_related(rtp_exp); 504 else if (ret != -EBUSY) {
505 nf_ct_unexpect_related(rtp_exp);
506 port = 0;
507 break;
508 }
490 } 509 }
491 510
492 if (port == 0) 511 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bdb..483b76d042da 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
31#ifdef CONFIG_XFRM 31#ifdef CONFIG_XFRM
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) 32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 33{
34 struct flowi4 *fl4 = &fl->u.ip4;
34 const struct nf_conn *ct; 35 const struct nf_conn *ct;
35 const struct nf_conntrack_tuple *t; 36 const struct nf_conntrack_tuple *t;
36 enum ip_conntrack_info ctinfo; 37 enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49 statusbit = IPS_SRC_NAT; 50 statusbit = IPS_SRC_NAT;
50 51
51 if (ct->status & statusbit) { 52 if (ct->status & statusbit) {
52 fl->fl4_dst = t->dst.u3.ip; 53 fl4->daddr = t->dst.u3.ip;
53 if (t->dst.protonum == IPPROTO_TCP || 54 if (t->dst.protonum == IPPROTO_TCP ||
54 t->dst.protonum == IPPROTO_UDP || 55 t->dst.protonum == IPPROTO_UDP ||
55 t->dst.protonum == IPPROTO_UDPLITE || 56 t->dst.protonum == IPPROTO_UDPLITE ||
56 t->dst.protonum == IPPROTO_DCCP || 57 t->dst.protonum == IPPROTO_DCCP ||
57 t->dst.protonum == IPPROTO_SCTP) 58 t->dst.protonum == IPPROTO_SCTP)
58 fl->fl_ip_dport = t->dst.u.tcp.port; 59 fl4->fl4_dport = t->dst.u.tcp.port;
59 } 60 }
60 61
61 statusbit ^= IPS_NAT_MASK; 62 statusbit ^= IPS_NAT_MASK;
62 63
63 if (ct->status & statusbit) { 64 if (ct->status & statusbit) {
64 fl->fl4_src = t->src.u3.ip; 65 fl4->saddr = t->src.u3.ip;
65 if (t->dst.protonum == IPPROTO_TCP || 66 if (t->dst.protonum == IPPROTO_TCP ||
66 t->dst.protonum == IPPROTO_UDP || 67 t->dst.protonum == IPPROTO_UDP ||
67 t->dst.protonum == IPPROTO_UDPLITE || 68 t->dst.protonum == IPPROTO_UDPLITE ||
68 t->dst.protonum == IPPROTO_DCCP || 69 t->dst.protonum == IPPROTO_DCCP ||
69 t->dst.protonum == IPPROTO_SCTP) 70 t->dst.protonum == IPPROTO_SCTP)
70 fl->fl_ip_sport = t->src.u.tcp.port; 71 fl4->fl4_sport = t->src.u.tcp.port;
71 } 72 }
72} 73}
73#endif 74#endif
@@ -115,7 +116,7 @@ nf_nat_fn(unsigned int hooknum,
115 116
116 switch (ctinfo) { 117 switch (ctinfo) {
117 case IP_CT_RELATED: 118 case IP_CT_RELATED:
118 case IP_CT_RELATED+IP_CT_IS_REPLY: 119 case IP_CT_RELATED_REPLY:
119 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 120 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
120 if (!nf_nat_icmp_reply_translation(ct, ctinfo, 121 if (!nf_nat_icmp_reply_translation(ct, ctinfo,
121 hooknum, skb)) 122 hooknum, skb))
@@ -143,7 +144,7 @@ nf_nat_fn(unsigned int hooknum,
143 default: 144 default:
144 /* ESTABLISHED */ 145 /* ESTABLISHED */
145 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || 146 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
146 ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); 147 ctinfo == IP_CT_ESTABLISHED_REPLY);
147 } 148 }
148 149
149 return nf_nat_packet(ct, ctinfo, hooknum, skb); 150 return nf_nat_packet(ct, ctinfo, hooknum, skb);
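
nat_decode_session() above now writes into the IPv4-specific struct flowi4 members (saddr/daddr plus the fl4_sport/fl4_dport port union) instead of the old protocol-neutral flowi fields. A kernel-style sketch of filling such a key for a connected flow; the helper name is hypothetical, the member names are the flowi4 ones used in the hunk:

static void fill_ipv4_flow_key(struct flowi4 *fl4,
                               __be32 saddr, __be32 daddr,
                               __be16 sport, __be16 dport, u8 proto)
{
        fl4->saddr        = saddr;
        fl4->daddr        = daddr;
        fl4->flowi4_proto = proto;
        if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
            proto == IPPROTO_UDPLITE || proto == IPPROTO_DCCP ||
            proto == IPPROTO_SCTP) {
                fl4->fl4_sport = sport;   /* lives in the flowi4 port union */
                fl4->fl4_dport = dport;
        }
}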
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 000000000000..39b403f854c6
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,931 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * "Ping" sockets
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Based on ipv4/udp.c code.
14 *
15 * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
16 * Pavel Kankovsky (for Linux 2.4.32)
17 *
18 * Pavel gave all rights to bugs to Vasiliy,
19 * none of the bugs are Pavel's now.
20 *
21 */
22
23#include <asm/system.h>
24#include <linux/uaccess.h>
25#include <linux/types.h>
26#include <linux/fcntl.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/timer.h>
32#include <linux/mm.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h>
35#include <net/snmp.h>
36#include <net/ip.h>
37#include <net/ipv6.h>
38#include <net/icmp.h>
39#include <net/protocol.h>
40#include <linux/skbuff.h>
41#include <linux/proc_fs.h>
42#include <net/sock.h>
43#include <net/ping.h>
44#include <net/udp.h>
45#include <net/route.h>
46#include <net/inet_common.h>
47#include <net/checksum.h>
48
49
50static struct ping_table ping_table;
51
52static u16 ping_port_rover;
53
54static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
55{
56 int res = (num + net_hash_mix(net)) & mask;
57 pr_debug("hash(%d) = %d\n", num, res);
58 return res;
59}
60
61static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
62 struct net *net, unsigned num)
63{
64 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
65}
66
67static int ping_v4_get_port(struct sock *sk, unsigned short ident)
68{
69 struct hlist_nulls_node *node;
70 struct hlist_nulls_head *hlist;
71 struct inet_sock *isk, *isk2;
72 struct sock *sk2 = NULL;
73
74 isk = inet_sk(sk);
75 write_lock_bh(&ping_table.lock);
76 if (ident == 0) {
77 u32 i;
78 u16 result = ping_port_rover + 1;
79
80 for (i = 0; i < (1L << 16); i++, result++) {
81 if (!result)
82 result++; /* avoid zero */
83 hlist = ping_hashslot(&ping_table, sock_net(sk),
84 result);
85 ping_portaddr_for_each_entry(sk2, node, hlist) {
86 isk2 = inet_sk(sk2);
87
88 if (isk2->inet_num == result)
89 goto next_port;
90 }
91
92 /* found */
93 ping_port_rover = ident = result;
94 break;
95next_port:
96 ;
97 }
98 if (i >= (1L << 16))
99 goto fail;
100 } else {
101 hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
102 ping_portaddr_for_each_entry(sk2, node, hlist) {
103 isk2 = inet_sk(sk2);
104
105 if ((isk2->inet_num == ident) &&
106 (sk2 != sk) &&
107 (!sk2->sk_reuse || !sk->sk_reuse))
108 goto fail;
109 }
110 }
111
112 pr_debug("found port/ident = %d\n", ident);
113 isk->inet_num = ident;
114 if (sk_unhashed(sk)) {
115 pr_debug("was not hashed\n");
116 sock_hold(sk);
117 hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
118 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
119 }
120 write_unlock_bh(&ping_table.lock);
121 return 0;
122
123fail:
124 write_unlock_bh(&ping_table.lock);
125 return 1;
126}
127
128static void ping_v4_hash(struct sock *sk)
129{
130 pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
131 BUG(); /* "Please do not press this button again." */
132}
133
134static void ping_v4_unhash(struct sock *sk)
135{
136 struct inet_sock *isk = inet_sk(sk);
137 pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
138 if (sk_hashed(sk)) {
139 write_lock_bh(&ping_table.lock);
140 hlist_nulls_del(&sk->sk_nulls_node);
141 sock_put(sk);
142 isk->inet_num = isk->inet_sport = 0;
143 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
144 write_unlock_bh(&ping_table.lock);
145 }
146}
147
148static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
149 u16 ident, int dif)
150{
151 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
152 struct sock *sk = NULL;
153 struct inet_sock *isk;
154 struct hlist_nulls_node *hnode;
155
156 pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
157 (int)ident, (unsigned long)daddr, dif);
158 read_lock_bh(&ping_table.lock);
159
160 ping_portaddr_for_each_entry(sk, hnode, hslot) {
161 isk = inet_sk(sk);
162
163 pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
164 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
165 sk->sk_bound_dev_if);
166
167 pr_debug("iterate\n");
168 if (isk->inet_num != ident)
169 continue;
170 if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)
171 continue;
172 if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
173 continue;
174
175 sock_hold(sk);
176 goto exit;
177 }
178
179 sk = NULL;
180exit:
181 read_unlock_bh(&ping_table.lock);
182
183 return sk;
184}
185
186static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
187 gid_t *high)
188{
189 gid_t *data = net->ipv4.sysctl_ping_group_range;
190 unsigned seq;
191 do {
192 seq = read_seqbegin(&sysctl_local_ports.lock);
193
194 *low = data[0];
195 *high = data[1];
196 } while (read_seqretry(&sysctl_local_ports.lock, seq));
197}
198
199
200static int ping_init_sock(struct sock *sk)
201{
202 struct net *net = sock_net(sk);
203 gid_t group = current_egid();
204 gid_t range[2];
205 struct group_info *group_info = get_current_groups();
206 int i, j, count = group_info->ngroups;
207
208 inet_get_ping_group_range_net(net, range, range+1);
209 if (range[0] <= group && group <= range[1])
210 return 0;
211
212 for (i = 0; i < group_info->nblocks; i++) {
213 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
214
215 for (j = 0; j < cp_count; j++) {
216 group = group_info->blocks[i][j];
217 if (range[0] <= group && group <= range[1])
218 return 0;
219 }
220
221 count -= cp_count;
222 }
223
224 return -EACCES;
225}
226
227static void ping_close(struct sock *sk, long timeout)
228{
229 pr_debug("ping_close(sk=%p,sk->num=%u)\n",
230 inet_sk(sk), inet_sk(sk)->inet_num);
231 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
232
233 sk_common_release(sk);
234}
235
236/*
237 * We need our own bind because there are no privileged id's == local ports.
238 * Moreover, we don't allow binding to multi- and broadcast addresses.
239 */
240
241static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
242{
243 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
244 struct inet_sock *isk = inet_sk(sk);
245 unsigned short snum;
246 int chk_addr_ret;
247 int err;
248
249 if (addr_len < sizeof(struct sockaddr_in))
250 return -EINVAL;
251
252 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
253 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
254
255 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
256 if (addr->sin_addr.s_addr == INADDR_ANY)
257 chk_addr_ret = RTN_LOCAL;
258
259 if ((sysctl_ip_nonlocal_bind == 0 &&
260 isk->freebind == 0 && isk->transparent == 0 &&
261 chk_addr_ret != RTN_LOCAL) ||
262 chk_addr_ret == RTN_MULTICAST ||
263 chk_addr_ret == RTN_BROADCAST)
264 return -EADDRNOTAVAIL;
265
266 lock_sock(sk);
267
268 err = -EINVAL;
269 if (isk->inet_num != 0)
270 goto out;
271
272 err = -EADDRINUSE;
273 isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
274 snum = ntohs(addr->sin_port);
275 if (ping_v4_get_port(sk, snum) != 0) {
276 isk->inet_saddr = isk->inet_rcv_saddr = 0;
277 goto out;
278 }
279
280 pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
281 (int)isk->inet_num,
282 (unsigned long) isk->inet_rcv_saddr,
283 (int)sk->sk_bound_dev_if);
284
285 err = 0;
286 if (isk->inet_rcv_saddr)
287 sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
288 if (snum)
289 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
290 isk->inet_sport = htons(isk->inet_num);
291 isk->inet_daddr = 0;
292 isk->inet_dport = 0;
293 sk_dst_reset(sk);
294out:
295 release_sock(sk);
296 pr_debug("ping_v4_bind -> %d\n", err);
297 return err;
298}
299
300/*
301 * Is this a supported type of ICMP message?
302 */
303
304static inline int ping_supported(int type, int code)
305{
306 if (type == ICMP_ECHO && code == 0)
307 return 1;
308 return 0;
309}
310
311/*
312 * This routine is called by the ICMP module when it gets some
313 * sort of error condition.
314 */
315
316static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
317
318void ping_err(struct sk_buff *skb, u32 info)
319{
320 struct iphdr *iph = (struct iphdr *)skb->data;
321 struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
322 struct inet_sock *inet_sock;
323 int type = icmph->type;
324 int code = icmph->code;
325 struct net *net = dev_net(skb->dev);
326 struct sock *sk;
327 int harderr;
328 int err;
329
330 /* We assume the packet has already been checked by icmp_unreach */
331
332 if (!ping_supported(icmph->type, icmph->code))
333 return;
334
335 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
336 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
337
338 sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
339 ntohs(icmph->un.echo.id), skb->dev->ifindex);
340 if (sk == NULL) {
341 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
342 pr_debug("no socket, dropping\n");
343 return; /* No socket for error */
344 }
345 pr_debug("err on socket %p\n", sk);
346
347 err = 0;
348 harderr = 0;
349 inet_sock = inet_sk(sk);
350
351 switch (type) {
352 default:
353 case ICMP_TIME_EXCEEDED:
354 err = EHOSTUNREACH;
355 break;
356 case ICMP_SOURCE_QUENCH:
357 /* This is not a real error but ping wants to see it.
358 * Report it with some fake errno. */
359 err = EREMOTEIO;
360 break;
361 case ICMP_PARAMETERPROB:
362 err = EPROTO;
363 harderr = 1;
364 break;
365 case ICMP_DEST_UNREACH:
366 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
367 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
368 err = EMSGSIZE;
369 harderr = 1;
370 break;
371 }
372 goto out;
373 }
374 err = EHOSTUNREACH;
375 if (code <= NR_ICMP_UNREACH) {
376 harderr = icmp_err_convert[code].fatal;
377 err = icmp_err_convert[code].errno;
378 }
379 break;
380 case ICMP_REDIRECT:
381 /* See ICMP_SOURCE_QUENCH */
382 err = EREMOTEIO;
383 break;
384 }
385
386 /*
387 * RFC1122: OK. Passes ICMP errors back to application, as per
388 * 4.1.3.3.
389 */
390 if (!inet_sock->recverr) {
391 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
392 goto out;
393 } else {
394 ip_icmp_error(sk, skb, err, 0 /* no remote port */,
395 info, (u8 *)icmph);
396 }
397 sk->sk_err = err;
398 sk->sk_error_report(sk);
399out:
400 sock_put(sk);
401}
402
403/*
404 * Copy and checksum an ICMP Echo packet from user space into a buffer.
405 */
406
407struct pingfakehdr {
408 struct icmphdr icmph;
409 struct iovec *iov;
410 u32 wcheck;
411};
412
413static int ping_getfrag(void *from, char * to,
414 int offset, int fraglen, int odd, struct sk_buff *skb)
415{
416 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
417
418 if (offset == 0) {
419 if (fraglen < sizeof(struct icmphdr))
420 BUG();
421 if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
422 pfh->iov, 0, fraglen - sizeof(struct icmphdr),
423 &pfh->wcheck))
424 return -EFAULT;
425
426 return 0;
427 }
428 if (offset < sizeof(struct icmphdr))
429 BUG();
430 if (csum_partial_copy_fromiovecend
431 (to, pfh->iov, offset - sizeof(struct icmphdr),
432 fraglen, &pfh->wcheck))
433 return -EFAULT;
434 return 0;
435}
436
437static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
438 struct flowi4 *fl4)
439{
440 struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
441
442 pfh->wcheck = csum_partial((char *)&pfh->icmph,
443 sizeof(struct icmphdr), pfh->wcheck);
444 pfh->icmph.checksum = csum_fold(pfh->wcheck);
445 memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
446 skb->ip_summed = CHECKSUM_NONE;
447 return ip_push_pending_frames(sk, fl4);
448}
449
450static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
451 size_t len)
452{
453 struct net *net = sock_net(sk);
454 struct flowi4 fl4;
455 struct inet_sock *inet = inet_sk(sk);
456 struct ipcm_cookie ipc;
457 struct icmphdr user_icmph;
458 struct pingfakehdr pfh;
459 struct rtable *rt = NULL;
460 struct ip_options_data opt_copy;
461 int free = 0;
462 u32 saddr, daddr, faddr;
463 u8 tos;
464 int err;
465
466 pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
467
468
469 if (len > 0xFFFF)
470 return -EMSGSIZE;
471
472 /*
473 * Check the flags.
474 */
475
476 /* Mirror BSD error message compatibility */
477 if (msg->msg_flags & MSG_OOB)
478 return -EOPNOTSUPP;
479
480 /*
481 * Fetch the ICMP header provided by the userland.
482 * iovec is modified!
483 */
484
485 if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
486 sizeof(struct icmphdr)))
487 return -EFAULT;
488 if (!ping_supported(user_icmph.type, user_icmph.code))
489 return -EINVAL;
490
491 /*
492 * Get and verify the address.
493 */
494
495 if (msg->msg_name) {
496 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
497 if (msg->msg_namelen < sizeof(*usin))
498 return -EINVAL;
499 if (usin->sin_family != AF_INET)
500 return -EINVAL;
501 daddr = usin->sin_addr.s_addr;
502 /* no remote port */
503 } else {
504 if (sk->sk_state != TCP_ESTABLISHED)
505 return -EDESTADDRREQ;
506 daddr = inet->inet_daddr;
507 /* no remote port */
508 }
509
510 ipc.addr = inet->inet_saddr;
511 ipc.opt = NULL;
512 ipc.oif = sk->sk_bound_dev_if;
513 ipc.tx_flags = 0;
514 err = sock_tx_timestamp(sk, &ipc.tx_flags);
515 if (err)
516 return err;
517
518 if (msg->msg_controllen) {
519 err = ip_cmsg_send(sock_net(sk), msg, &ipc);
520 if (err)
521 return err;
522 if (ipc.opt)
523 free = 1;
524 }
525 if (!ipc.opt) {
526 struct ip_options_rcu *inet_opt;
527
528 rcu_read_lock();
529 inet_opt = rcu_dereference(inet->inet_opt);
530 if (inet_opt) {
531 memcpy(&opt_copy, inet_opt,
532 sizeof(*inet_opt) + inet_opt->opt.optlen);
533 ipc.opt = &opt_copy.opt;
534 }
535 rcu_read_unlock();
536 }
537
538 saddr = ipc.addr;
539 ipc.addr = faddr = daddr;
540
541 if (ipc.opt && ipc.opt->opt.srr) {
542 if (!daddr)
543 return -EINVAL;
544 faddr = ipc.opt->opt.faddr;
545 }
546 tos = RT_TOS(inet->tos);
547 if (sock_flag(sk, SOCK_LOCALROUTE) ||
548 (msg->msg_flags & MSG_DONTROUTE) ||
549 (ipc.opt && ipc.opt->opt.is_strictroute)) {
550 tos |= RTO_ONLINK;
551 }
552
553 if (ipv4_is_multicast(daddr)) {
554 if (!ipc.oif)
555 ipc.oif = inet->mc_index;
556 if (!saddr)
557 saddr = inet->mc_addr;
558 }
559
560 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
561 RT_SCOPE_UNIVERSE, sk->sk_protocol,
562 inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
563
564 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
565 rt = ip_route_output_flow(net, &fl4, sk);
566 if (IS_ERR(rt)) {
567 err = PTR_ERR(rt);
568 rt = NULL;
569 if (err == -ENETUNREACH)
570 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
571 goto out;
572 }
573
574 err = -EACCES;
575 if ((rt->rt_flags & RTCF_BROADCAST) &&
576 !sock_flag(sk, SOCK_BROADCAST))
577 goto out;
578
579 if (msg->msg_flags & MSG_CONFIRM)
580 goto do_confirm;
581back_from_confirm:
582
583 if (!ipc.addr)
584 ipc.addr = fl4.daddr;
585
586 lock_sock(sk);
587
588 pfh.icmph.type = user_icmph.type; /* already checked */
589 pfh.icmph.code = user_icmph.code; /* ditto */
590 pfh.icmph.checksum = 0;
591 pfh.icmph.un.echo.id = inet->inet_sport;
592 pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
593 pfh.iov = msg->msg_iov;
594 pfh.wcheck = 0;
595
596 err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
597 0, &ipc, &rt, msg->msg_flags);
598 if (err)
599 ip_flush_pending_frames(sk);
600 else
601 err = ping_push_pending_frames(sk, &pfh, &fl4);
602 release_sock(sk);
603
604out:
605 ip_rt_put(rt);
606 if (free)
607 kfree(ipc.opt);
608 if (!err) {
609 icmp_out_count(sock_net(sk), user_icmph.type);
610 return len;
611 }
612 return err;
613
614do_confirm:
615 dst_confirm(&rt->dst);
616 if (!(msg->msg_flags & MSG_PROBE) || len)
617 goto back_from_confirm;
618 err = 0;
619 goto out;
620}
621
622static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
623 size_t len, int noblock, int flags, int *addr_len)
624{
625 struct inet_sock *isk = inet_sk(sk);
626 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
627 struct sk_buff *skb;
628 int copied, err;
629
630 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
631
632 if (flags & MSG_OOB)
633 goto out;
634
635 if (addr_len)
636 *addr_len = sizeof(*sin);
637
638 if (flags & MSG_ERRQUEUE)
639 return ip_recv_error(sk, msg, len);
640
641 skb = skb_recv_datagram(sk, flags, noblock, &err);
642 if (!skb)
643 goto out;
644
645 copied = skb->len;
646 if (copied > len) {
647 msg->msg_flags |= MSG_TRUNC;
648 copied = len;
649 }
650
651 /* Don't bother checking the checksum */
652 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
653 if (err)
654 goto done;
655
656 sock_recv_timestamp(msg, sk, skb);
657
658 /* Copy the address. */
659 if (sin) {
660 sin->sin_family = AF_INET;
661 sin->sin_port = 0 /* skb->h.uh->source */;
662 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
663 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
664 }
665 if (isk->cmsg_flags)
666 ip_cmsg_recv(msg, skb);
667 err = copied;
668
669done:
670 skb_free_datagram(sk, skb);
671out:
672 pr_debug("ping_recvmsg -> %d\n", err);
673 return err;
674}
675
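/*
 * Charge an incoming echo reply to the owning socket's receive queue;
 * on failure the skb is dropped and ICMP_MIB_INERRORS is bumped.
 */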
676static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
677{
678 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
679 inet_sk(sk), inet_sk(sk)->inet_num, skb);
680 if (sock_queue_rcv_skb(sk, skb) < 0) {
681 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
682 kfree_skb(skb);
683 pr_debug("ping_queue_rcv_skb -> failed\n");
684 return -1;
685 }
686 return 0;
687}
688
689
690/*
691 * All we need to do is get the socket.
692 */
693
694void ping_rcv(struct sk_buff *skb)
695{
696 struct sock *sk;
697 struct net *net = dev_net(skb->dev);
698 struct iphdr *iph = ip_hdr(skb);
699 struct icmphdr *icmph = icmp_hdr(skb);
700 u32 saddr = iph->saddr;
701 u32 daddr = iph->daddr;
702
703 /* We assume the packet has already been checked by icmp_rcv */
704
705 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
706 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
707
708 /* Push ICMP header back */
709 skb_push(skb, skb->data - (u8 *)icmph);
710
711 sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
712 skb->dev->ifindex);
713 if (sk != NULL) {
714 pr_debug("rcv on socket %p\n", sk);
715 ping_queue_rcv_skb(sk, skb_get(skb));
716 sock_put(sk);
717 return;
718 }
719 pr_debug("no socket, dropping\n");
720
721 /* We're called from icmp_rcv(). kfree_skb() is done there. */
722}
723
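/*
 * Protocol hooks for ICMP "ping" sockets: connect/disconnect and the
 * sockopt handlers are shared with the generic IPv4 datagram and UDP
 * code, everything else is implemented in this file.
 */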
724struct proto ping_prot = {
725 .name = "PING",
726 .owner = THIS_MODULE,
727 .init = ping_init_sock,
728 .close = ping_close,
729 .connect = ip4_datagram_connect,
730 .disconnect = udp_disconnect,
731 .setsockopt = ip_setsockopt,
732 .getsockopt = ip_getsockopt,
733 .sendmsg = ping_sendmsg,
734 .recvmsg = ping_recvmsg,
735 .bind = ping_bind,
736 .backlog_rcv = ping_queue_rcv_skb,
737 .hash = ping_v4_hash,
738 .unhash = ping_v4_unhash,
739 .get_port = ping_v4_get_port,
740 .obj_size = sizeof(struct inet_sock),
741};
742EXPORT_SYMBOL(ping_prot);
743
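/*
 * Illustrative user-space sketch (not part of this file): one way the
 * ping_sendmsg()/ping_recvmsg() paths above can be exercised, assuming
 * the kernel lets the caller's group open unprivileged ICMP datagram
 * sockets (see the ping_group_range sysctl).  The kernel rewrites the
 * echo id with the socket's local "port" and computes the checksum, so
 * only the type, code and sequence set here matter.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/ip_icmp.h>
 *	#include <arpa/inet.h>
 *
 *	int main(void)
 *	{
 *		int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
 *		struct sockaddr_in dst = { .sin_family = AF_INET };
 *		struct icmphdr req = { .type = ICMP_ECHO };	/* code 0 */
 *		char buf[192];
 *
 *		if (fd < 0) {
 *			perror("socket");
 *			return 1;
 *		}
 *		inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
 *		req.un.echo.sequence = htons(1);
 *		if (sendto(fd, &req, sizeof(req), 0,
 *			   (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *			perror("sendto");
 *		else if (recv(fd, buf, sizeof(buf), 0) < 0)
 *			perror("recv");
 *		close(fd);
 *		return 0;
 *	}
 */
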
744#ifdef CONFIG_PROC_FS
745
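/*
 * /proc/net/icmp seq_file helpers: iterate the ping hash table,
 * skipping sockets that belong to other network namespaces.
 */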
746static struct sock *ping_get_first(struct seq_file *seq, int start)
747{
748 struct sock *sk;
749 struct ping_iter_state *state = seq->private;
750 struct net *net = seq_file_net(seq);
751
752 for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
753 ++state->bucket) {
754 struct hlist_nulls_node *node;
755 struct hlist_nulls_head *hslot;
756
757 hslot = &ping_table.hash[state->bucket];
758
759 if (hlist_nulls_empty(hslot))
760 continue;
761
762 sk_nulls_for_each(sk, node, hslot) {
763 if (net_eq(sock_net(sk), net))
764 goto found;
765 }
766 }
767 sk = NULL;
768found:
769 return sk;
770}
771
772static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
773{
774 struct ping_iter_state *state = seq->private;
775 struct net *net = seq_file_net(seq);
776
777 do {
778 sk = sk_nulls_next(sk);
779 } while (sk && (!net_eq(sock_net(sk), net)));
780
781 if (!sk)
782 return ping_get_first(seq, state->bucket + 1);
783 return sk;
784}
785
786static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
787{
788 struct sock *sk = ping_get_first(seq, 0);
789
790 if (sk)
791 while (pos && (sk = ping_get_next(seq, sk)) != NULL)
792 --pos;
793 return pos ? NULL : sk;
794}
795
796static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
797{
798 struct ping_iter_state *state = seq->private;
799 state->bucket = 0;
800
801 read_lock_bh(&ping_table.lock);
802
803 return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
804}
805
806static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
807{
808 struct sock *sk;
809
810 if (v == SEQ_START_TOKEN)
811 sk = ping_get_idx(seq, 0);
812 else
813 sk = ping_get_next(seq, v);
814
815 ++*pos;
816 return sk;
817}
818
819static void ping_seq_stop(struct seq_file *seq, void *v)
820{
821 read_unlock_bh(&ping_table.lock);
822}
823
824static void ping_format_sock(struct sock *sp, struct seq_file *f,
825 int bucket, int *len)
826{
827 struct inet_sock *inet = inet_sk(sp);
828 __be32 dest = inet->inet_daddr;
829 __be32 src = inet->inet_rcv_saddr;
830 __u16 destp = ntohs(inet->inet_dport);
831 __u16 srcp = ntohs(inet->inet_sport);
832
833 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
834 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
835 bucket, src, srcp, dest, destp, sp->sk_state,
836 sk_wmem_alloc_get(sp),
837 sk_rmem_alloc_get(sp),
838 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
839 atomic_read(&sp->sk_refcnt), sp,
840 atomic_read(&sp->sk_drops), len);
841}
842
843static int ping_seq_show(struct seq_file *seq, void *v)
844{
845 if (v == SEQ_START_TOKEN)
846 seq_printf(seq, "%-127s\n",
847 " sl local_address rem_address st tx_queue "
848 "rx_queue tr tm->when retrnsmt uid timeout "
849 "inode ref pointer drops");
850 else {
851 struct ping_iter_state *state = seq->private;
852 int len;
853
854 ping_format_sock(v, seq, state->bucket, &len);
855 seq_printf(seq, "%*s\n", 127 - len, "");
856 }
857 return 0;
858}
859
860static const struct seq_operations ping_seq_ops = {
861 .show = ping_seq_show,
862 .start = ping_seq_start,
863 .next = ping_seq_next,
864 .stop = ping_seq_stop,
865};
866
867static int ping_seq_open(struct inode *inode, struct file *file)
868{
869 return seq_open_net(inode, file, &ping_seq_ops,
870 sizeof(struct ping_iter_state));
871}
872
873static const struct file_operations ping_seq_fops = {
874 .open = ping_seq_open,
875 .read = seq_read,
876 .llseek = seq_lseek,
877 .release = seq_release_net,
878};
879
880static int ping_proc_register(struct net *net)
881{
882 struct proc_dir_entry *p;
883 int rc = 0;
884
885 p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
886 if (!p)
887 rc = -ENOMEM;
888 return rc;
889}
890
891static void ping_proc_unregister(struct net *net)
892{
893 proc_net_remove(net, "icmp");
894}
895
896
897static int __net_init ping_proc_init_net(struct net *net)
898{
899 return ping_proc_register(net);
900}
901
902static void __net_exit ping_proc_exit_net(struct net *net)
903{
904 ping_proc_unregister(net);
905}
906
907static struct pernet_operations ping_net_ops = {
908 .init = ping_proc_init_net,
909 .exit = ping_proc_exit_net,
910};
911
912int __init ping_proc_init(void)
913{
914 return register_pernet_subsys(&ping_net_ops);
915}
916
917void ping_proc_exit(void)
918{
919 unregister_pernet_subsys(&ping_net_ops);
920}
921
922#endif
923
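/*
 * Set up the ping socket hash table heads and the rwlock protecting them.
 */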
924void __init ping_init(void)
925{
926 int i;
927
928 for (i = 0; i < PING_HTABLE_SIZE; i++)
929 INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
930 rwlock_init(&ping_table.lock);
931}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4ae1f203f7cb..b14ec7d03b6e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
59 local_bh_enable(); 59 local_bh_enable();
60 60
61 socket_seq_show(seq); 61 socket_seq_show(seq);
62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
63 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
64 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
65 atomic_read(&tcp_memory_allocated)); 65 atomic_long_read(&tcp_memory_allocated));
66 seq_printf(seq, "UDP: inuse %d mem %d\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
67 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
68 atomic_read(&udp_memory_allocated)); 68 atomic_long_read(&udp_memory_allocated));
69 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
70 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
71 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), 255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
256 SNMP_MIB_SENTINEL 257 SNMP_MIB_SENTINEL
257}; 258};
258 259
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..9ae5c01cd0b2 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32static DEFINE_SPINLOCK(inet_proto_lock);
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,20 +36,10 @@ static DEFINE_SPINLOCK(inet_proto_lock);
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 int hash, ret; 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 40
42 hash = protocol & (MAX_INET_PROTOS - 1); 41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
43 42 NULL, prot) ? 0 : -1;
44 spin_lock_bh(&inet_proto_lock);
45 if (inet_protos[hash]) {
46 ret = -1;
47 } else {
48 inet_protos[hash] = prot;
49 ret = 0;
50 }
51 spin_unlock_bh(&inet_proto_lock);
52
53 return ret;
54} 43}
55EXPORT_SYMBOL(inet_add_protocol); 44EXPORT_SYMBOL(inet_add_protocol);
56 45
@@ -60,18 +49,10 @@ EXPORT_SYMBOL(inet_add_protocol);
60 49
61int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
62{ 51{
63 int hash, ret; 52 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
64
65 hash = protocol & (MAX_INET_PROTOS - 1);
66 53
67 spin_lock_bh(&inet_proto_lock); 54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
68 if (inet_protos[hash] == prot) { 55 prot, NULL) == prot) ? 0 : -1;
69 inet_protos[hash] = NULL;
70 ret = 0;
71 } else {
72 ret = -1;
73 }
74 spin_unlock_bh(&inet_proto_lock);
75 56
76 synchronize_net(); 57 synchronize_net();
77 58
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..c9893d43242e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -76,6 +76,7 @@
76#include <linux/seq_file.h> 76#include <linux/seq_file.h>
77#include <linux/netfilter.h> 77#include <linux/netfilter.h>
78#include <linux/netfilter_ipv4.h> 78#include <linux/netfilter_ipv4.h>
79#include <linux/compat.h>
79 80
80static struct raw_hashinfo raw_v4_hashinfo = { 81static struct raw_hashinfo raw_v4_hashinfo = {
81 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), 82 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -153,7 +154,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
153 * RFC 1122: SHOULD pass TOS value up to the transport layer. 154 * RFC 1122: SHOULD pass TOS value up to the transport layer.
154 * -> It does. And not only TOS, but all IP header. 155 * -> It does. And not only TOS, but all IP header.
155 */ 156 */
156static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 157static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
157{ 158{
158 struct sock *sk; 159 struct sock *sk;
159 struct hlist_head *head; 160 struct hlist_head *head;
@@ -246,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
246 } 247 }
247 248
248 if (inet->recverr) { 249 if (inet->recverr) {
249 struct iphdr *iph = (struct iphdr *)skb->data; 250 const struct iphdr *iph = (const struct iphdr *)skb->data;
250 u8 *payload = skb->data + (iph->ihl << 2); 251 u8 *payload = skb->data + (iph->ihl << 2);
251 252
252 if (inet->hdrincl) 253 if (inet->hdrincl)
@@ -264,7 +265,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
264{ 265{
265 int hash; 266 int hash;
266 struct sock *raw_sk; 267 struct sock *raw_sk;
267 struct iphdr *iph; 268 const struct iphdr *iph;
268 struct net *net; 269 struct net *net;
269 270
270 hash = protocol & (RAW_HTABLE_SIZE - 1); 271 hash = protocol & (RAW_HTABLE_SIZE - 1);
@@ -272,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
272 read_lock(&raw_v4_hashinfo.lock); 273 read_lock(&raw_v4_hashinfo.lock);
273 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); 274 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
274 if (raw_sk != NULL) { 275 if (raw_sk != NULL) {
275 iph = (struct iphdr *)skb->data; 276 iph = (const struct iphdr *)skb->data;
276 net = dev_net(skb->dev); 277 net = dev_net(skb->dev);
277 278
278 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, 279 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
@@ -280,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
280 skb->dev->ifindex)) != NULL) { 281 skb->dev->ifindex)) != NULL) {
281 raw_err(raw_sk, skb, info); 282 raw_err(raw_sk, skb, info);
282 raw_sk = sk_next(raw_sk); 283 raw_sk = sk_next(raw_sk);
283 iph = (struct iphdr *)skb->data; 284 iph = (const struct iphdr *)skb->data;
284 } 285 }
285 } 286 }
286 read_unlock(&raw_v4_hashinfo.lock); 287 read_unlock(&raw_v4_hashinfo.lock);
@@ -313,9 +314,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
313 return 0; 314 return 0;
314} 315}
315 316
316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 317static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
317 struct rtable **rtp, 318 void *from, size_t length,
318 unsigned int flags) 319 struct rtable **rtp,
320 unsigned int flags)
319{ 321{
320 struct inet_sock *inet = inet_sk(sk); 322 struct inet_sock *inet = inet_sk(sk);
321 struct net *net = sock_net(sk); 323 struct net *net = sock_net(sk);
@@ -326,7 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
326 struct rtable *rt = *rtp; 328 struct rtable *rt = *rtp;
327 329
328 if (length > rt->dst.dev->mtu) { 330 if (length > rt->dst.dev->mtu) {
329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 331 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
330 rt->dst.dev->mtu); 332 rt->dst.dev->mtu);
331 return -EMSGSIZE; 333 return -EMSGSIZE;
332 } 334 }
@@ -371,7 +373,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
371 373
372 if (iphlen >= sizeof(*iph)) { 374 if (iphlen >= sizeof(*iph)) {
373 if (!iph->saddr) 375 if (!iph->saddr)
374 iph->saddr = rt->rt_src; 376 iph->saddr = fl4->saddr;
375 iph->check = 0; 377 iph->check = 0;
376 iph->tot_len = htons(length); 378 iph->tot_len = htons(length);
377 if (!iph->id) 379 if (!iph->id)
@@ -401,7 +403,7 @@ error:
401 return err; 403 return err;
402} 404}
403 405
404static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 406static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
405{ 407{
406 struct iovec *iov; 408 struct iovec *iov;
407 u8 __user *type = NULL; 409 u8 __user *type = NULL;
@@ -417,7 +419,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
417 if (!iov) 419 if (!iov)
418 continue; 420 continue;
419 421
420 switch (fl->proto) { 422 switch (fl4->flowi4_proto) {
421 case IPPROTO_ICMP: 423 case IPPROTO_ICMP:
422 /* check if one-byte field is readable or not. */ 424 /* check if one-byte field is readable or not. */
423 if (iov->iov_base && iov->iov_len < 1) 425 if (iov->iov_base && iov->iov_len < 1)
@@ -432,8 +434,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
432 code = iov->iov_base; 434 code = iov->iov_base;
433 435
434 if (type && code) { 436 if (type && code) {
435 if (get_user(fl->fl_icmp_type, type) || 437 if (get_user(fl4->fl4_icmp_type, type) ||
436 get_user(fl->fl_icmp_code, code)) 438 get_user(fl4->fl4_icmp_code, code))
437 return -EFAULT; 439 return -EFAULT;
438 probed = 1; 440 probed = 1;
439 } 441 }
@@ -454,11 +456,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
454 struct inet_sock *inet = inet_sk(sk); 456 struct inet_sock *inet = inet_sk(sk);
455 struct ipcm_cookie ipc; 457 struct ipcm_cookie ipc;
456 struct rtable *rt = NULL; 458 struct rtable *rt = NULL;
459 struct flowi4 fl4;
457 int free = 0; 460 int free = 0;
458 __be32 daddr; 461 __be32 daddr;
459 __be32 saddr; 462 __be32 saddr;
460 u8 tos; 463 u8 tos;
461 int err; 464 int err;
465 struct ip_options_data opt_copy;
462 466
463 err = -EMSGSIZE; 467 err = -EMSGSIZE;
464 if (len > 0xFFFF) 468 if (len > 0xFFFF)
@@ -505,7 +509,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
505 509
506 ipc.addr = inet->inet_saddr; 510 ipc.addr = inet->inet_saddr;
507 ipc.opt = NULL; 511 ipc.opt = NULL;
508 ipc.shtx.flags = 0; 512 ipc.tx_flags = 0;
509 ipc.oif = sk->sk_bound_dev_if; 513 ipc.oif = sk->sk_bound_dev_if;
510 514
511 if (msg->msg_controllen) { 515 if (msg->msg_controllen) {
@@ -519,8 +523,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
519 saddr = ipc.addr; 523 saddr = ipc.addr;
520 ipc.addr = daddr; 524 ipc.addr = daddr;
521 525
522 if (!ipc.opt) 526 if (!ipc.opt) {
523 ipc.opt = inet->opt; 527 struct ip_options_rcu *inet_opt;
528
529 rcu_read_lock();
530 inet_opt = rcu_dereference(inet->inet_opt);
531 if (inet_opt) {
532 memcpy(&opt_copy, inet_opt,
533 sizeof(*inet_opt) + inet_opt->opt.optlen);
534 ipc.opt = &opt_copy.opt;
535 }
536 rcu_read_unlock();
537 }
524 538
525 if (ipc.opt) { 539 if (ipc.opt) {
526 err = -EINVAL; 540 err = -EINVAL;
@@ -529,10 +543,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
529 */ 543 */
530 if (inet->hdrincl) 544 if (inet->hdrincl)
531 goto done; 545 goto done;
532 if (ipc.opt->srr) { 546 if (ipc.opt->opt.srr) {
533 if (!daddr) 547 if (!daddr)
534 goto done; 548 goto done;
535 daddr = ipc.opt->faddr; 549 daddr = ipc.opt->opt.faddr;
536 } 550 }
537 } 551 }
538 tos = RT_CONN_FLAGS(sk); 552 tos = RT_CONN_FLAGS(sk);
@@ -546,27 +560,24 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
546 saddr = inet->mc_addr; 560 saddr = inet->mc_addr;
547 } 561 }
548 562
549 { 563 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
550 struct flowi fl = { .oif = ipc.oif, 564 RT_SCOPE_UNIVERSE,
551 .mark = sk->sk_mark, 565 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
552 .nl_u = { .ip4_u = 566 FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0);
553 { .daddr = daddr,
554 .saddr = saddr,
555 .tos = tos } },
556 .proto = inet->hdrincl ? IPPROTO_RAW :
557 sk->sk_protocol,
558 };
559 if (!inet->hdrincl) {
560 err = raw_probe_proto_opt(&fl, msg);
561 if (err)
562 goto done;
563 }
564 567
565 security_sk_classify_flow(sk, &fl); 568 if (!inet->hdrincl) {
566 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 569 err = raw_probe_proto_opt(&fl4, msg);
570 if (err)
571 goto done;
567 } 572 }
568 if (err) 573
574 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
575 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
576 if (IS_ERR(rt)) {
577 err = PTR_ERR(rt);
578 rt = NULL;
569 goto done; 579 goto done;
580 }
570 581
571 err = -EACCES; 582 err = -EACCES;
572 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 583 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -577,19 +588,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
577back_from_confirm: 588back_from_confirm:
578 589
579 if (inet->hdrincl) 590 if (inet->hdrincl)
580 err = raw_send_hdrinc(sk, msg->msg_iov, len, 591 err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
581 &rt, msg->msg_flags); 592 &rt, msg->msg_flags);
582 593
583 else { 594 else {
584 if (!ipc.addr) 595 if (!ipc.addr)
585 ipc.addr = rt->rt_dst; 596 ipc.addr = fl4.daddr;
586 lock_sock(sk); 597 lock_sock(sk);
587 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, 598 err = ip_append_data(sk, &fl4, ip_generic_getfrag,
588 &ipc, &rt, msg->msg_flags); 599 msg->msg_iov, len, 0,
600 &ipc, &rt, msg->msg_flags);
589 if (err) 601 if (err)
590 ip_flush_pending_frames(sk); 602 ip_flush_pending_frames(sk);
591 else if (!(msg->msg_flags & MSG_MORE)) { 603 else if (!(msg->msg_flags & MSG_MORE)) {
592 err = ip_push_pending_frames(sk); 604 err = ip_push_pending_frames(sk, &fl4);
593 if (err == -ENOBUFS && !inet->recverr) 605 if (err == -ENOBUFS && !inet->recverr)
594 err = 0; 606 err = 0;
595 } 607 }
@@ -616,7 +628,7 @@ do_confirm:
616static void raw_close(struct sock *sk, long timeout) 628static void raw_close(struct sock *sk, long timeout)
617{ 629{
618 /* 630 /*
619 * Raw sockets may have direct kernel refereneces. Kill them. 631 * Raw sockets may have direct kernel references. Kill them.
620 */ 632 */
621 ip_ra_control(sk, 0, NULL); 633 ip_ra_control(sk, 0, NULL);
622 634
@@ -839,6 +851,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
839 } 851 }
840} 852}
841 853
854#ifdef CONFIG_COMPAT
855static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
856{
857 switch (cmd) {
858 case SIOCOUTQ:
859 case SIOCINQ:
860 return -ENOIOCTLCMD;
861 default:
862#ifdef CONFIG_IP_MROUTE
863 return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
864#else
865 return -ENOIOCTLCMD;
866#endif
867 }
868}
869#endif
870
842struct proto raw_prot = { 871struct proto raw_prot = {
843 .name = "RAW", 872 .name = "RAW",
844 .owner = THIS_MODULE, 873 .owner = THIS_MODULE,
@@ -861,6 +890,7 @@ struct proto raw_prot = {
861#ifdef CONFIG_COMPAT 890#ifdef CONFIG_COMPAT
862 .compat_setsockopt = compat_raw_setsockopt, 891 .compat_setsockopt = compat_raw_setsockopt,
863 .compat_getsockopt = compat_raw_getsockopt, 892 .compat_getsockopt = compat_raw_getsockopt,
893 .compat_ioctl = compat_raw_ioctl,
864#endif 894#endif
865}; 895};
866 896
@@ -949,7 +979,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
949 srcp = inet->inet_num; 979 srcp = inet->inet_num;
950 980
951 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 981 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
952 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", 982 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
953 i, src, srcp, dest, destp, sp->sk_state, 983 i, src, srcp, dest, destp, sp->sk_state,
954 sk_wmem_alloc_get(sp), 984 sk_wmem_alloc_get(sp),
955 sk_rmem_alloc_get(sp), 985 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ac6559cb54f9..aa13ef105110 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
109#include <linux/sysctl.h> 109#include <linux/sysctl.h>
110#endif 110#endif
111 111
112#define RT_FL_TOS(oldflp) \ 112#define RT_FL_TOS(oldflp4) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 113 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 114
115#define IP_MAX_MTU 0xFFF0 115#define IP_MAX_MTU 0xFFF0
116 116
@@ -131,42 +131,80 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
140 137
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 138static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
139static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
140static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142static void ipv4_dst_destroy(struct dst_entry *dst); 141static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 143static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 144static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 145static int rt_garbage_collect(struct dst_ops *ops);
149 146
147static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
148 int how)
149{
150}
151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, rt->rt_dst, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
150 186
151static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
152 .family = AF_INET, 188 .family = AF_INET,
153 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
154 .gc = rt_garbage_collect, 190 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
192 .default_advmss = ipv4_default_advmss,
193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
156 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
159 .link_failure = ipv4_link_failure, 198 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu, 199 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out, 200 .local_out = __ip_local_out,
162 .entries = ATOMIC_INIT(0),
163}; 201};
164 202
165#define ECN_OR_COST(class) TC_PRIO_##class 203#define ECN_OR_COST(class) TC_PRIO_##class
166 204
167const __u8 ip_tos2prio[16] = { 205const __u8 ip_tos2prio[16] = {
168 TC_PRIO_BESTEFFORT, 206 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER), 207 ECN_OR_COST(BESTEFFORT),
170 TC_PRIO_BESTEFFORT, 208 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT), 209 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK, 210 TC_PRIO_BULK,
@@ -199,7 +237,7 @@ const __u8 ip_tos2prio[16] = {
199 */ 237 */
200 238
201struct rt_hash_bucket { 239struct rt_hash_bucket {
202 struct rtable *chain; 240 struct rtable __rcu *chain;
203}; 241};
204 242
205#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 243#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -281,7 +319,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
281 struct rtable *r = NULL; 319 struct rtable *r = NULL;
282 320
283 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 321 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 if (!rt_hash_table[st->bucket].chain) 322 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
285 continue; 323 continue;
286 rcu_read_lock_bh(); 324 rcu_read_lock_bh();
287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 325 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,17 +339,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301{ 339{
302 struct rt_cache_iter_state *st = seq->private; 340 struct rt_cache_iter_state *st = seq->private;
303 341
304 r = r->dst.rt_next; 342 r = rcu_dereference_bh(r->dst.rt_next);
305 while (!r) { 343 while (!r) {
306 rcu_read_unlock_bh(); 344 rcu_read_unlock_bh();
307 do { 345 do {
308 if (--st->bucket < 0) 346 if (--st->bucket < 0)
309 return NULL; 347 return NULL;
310 } while (!rt_hash_table[st->bucket].chain); 348 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
311 rcu_read_lock_bh(); 349 rcu_read_lock_bh();
312 r = rt_hash_table[st->bucket].chain; 350 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313 } 351 }
314 return rcu_dereference_bh(r); 352 return r;
315} 353}
316 354
317static struct rtable *rt_cache_get_next(struct seq_file *seq, 355static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -382,12 +420,11 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
382 (__force u32)r->rt_gateway, 420 (__force u32)r->rt_gateway,
383 r->rt_flags, atomic_read(&r->dst.__refcnt), 421 r->rt_flags, atomic_read(&r->dst.__refcnt),
384 r->dst.__use, 0, (__force u32)r->rt_src, 422 r->dst.__use, 0, (__force u32)r->rt_src,
385 (dst_metric(&r->dst, RTAX_ADVMSS) ? 423 dst_metric_advmss(&r->dst) + 40,
386 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
387 dst_metric(&r->dst, RTAX_WINDOW), 424 dst_metric(&r->dst, RTAX_WINDOW),
388 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 425 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
389 dst_metric(&r->dst, RTAX_RTTVAR)), 426 dst_metric(&r->dst, RTAX_RTTVAR)),
390 r->fl.fl4_tos, 427 r->rt_key_tos,
391 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 428 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
392 r->dst.hh ? (r->dst.hh->hh_output == 429 r->dst.hh ? (r->dst.hh->hh_output ==
393 dev_queue_xmit) : 0, 430 dev_queue_xmit) : 0,
@@ -466,7 +503,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
466 503
467 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 504 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
468 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 505 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 atomic_read(&ipv4_dst_ops.entries), 506 dst_entries_get_slow(&ipv4_dst_ops),
470 st->in_hit, 507 st->in_hit,
471 st->in_slow_tot, 508 st->in_slow_tot,
472 st->in_slow_mc, 509 st->in_slow_mc,
@@ -510,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
510 .release = seq_release, 547 .release = seq_release,
511}; 548};
512 549
513#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
514static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
515{ 552{
516 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -563,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
563 if (!pde) 600 if (!pde)
564 goto err2; 601 goto err2;
565 602
566#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
567 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
568 if (!pde) 605 if (!pde)
569 goto err3; 606 goto err3;
570#endif 607#endif
571 return 0; 608 return 0;
572 609
573#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
574err3: 611err3:
575 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
576#endif 613#endif
@@ -584,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
584{ 621{
585 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
586 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
587#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
588 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
589#endif 626#endif
590} 627}
@@ -622,13 +659,13 @@ static inline int rt_fast_clean(struct rtable *rth)
622 	/* Kill broadcast/multicast entries very aggressively, if they 659 	/* Kill broadcast/multicast entries very aggressively, if they
623 collide in hash table with more useful entries */ 660 collide in hash table with more useful entries */
624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 661 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 rth->fl.iif && rth->dst.rt_next; 662 rt_is_input_route(rth) && rth->dst.rt_next;
626} 663}
627 664
628static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
629{ 666{
630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
632} 669}
633 670
634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -639,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
639 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
640 goto out; 677 goto out;
641 678
642 ret = 1;
643 if (rth->dst.expires &&
644 time_after_eq(jiffies, rth->dst.expires))
645 goto out;
646
647 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
648 ret = 0;
649 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
651 goto out; 682 goto out;
@@ -667,7 +698,7 @@ static inline u32 rt_score(struct rtable *rt)
667 if (rt_valuable(rt)) 698 if (rt_valuable(rt))
668 score |= (1<<31); 699 score |= (1<<31);
669 700
670 if (!rt->fl.iif || 701 if (rt_is_output_route(rt) ||
671 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 702 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 score |= (1<<30); 703 score |= (1<<30);
673 704
@@ -680,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
680 net->ipv4.sysctl_rt_cache_rebuild_count; 711 net->ipv4.sysctl_rt_cache_rebuild_count;
681} 712}
682 713
683static inline bool compare_hash_inputs(const struct flowi *fl1, 714static inline bool compare_hash_inputs(const struct rtable *rt1,
684 const struct flowi *fl2) 715 const struct rtable *rt2)
685{ 716{
686 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 717 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
687 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 718 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
688 (fl1->iif ^ fl2->iif)) == 0); 719 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
689} 720}
690 721
691static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 722static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
692{ 723{
693 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 724 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
694 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 725 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
695 (fl1->mark ^ fl2->mark) | 726 (rt1->rt_mark ^ rt2->rt_mark) |
696 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | 727 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
697 (fl1->oif ^ fl2->oif) | 728 (rt1->rt_oif ^ rt2->rt_oif) |
698 (fl1->iif ^ fl2->iif)) == 0; 729 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
699} 730}
700 731
701static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 732static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
@@ -713,55 +744,48 @@ static inline int rt_is_expired(struct rtable *rth)
713 * Can be called by a softirq or a process. 744 * Can be called by a softirq or a process.
714 	 * In the latter case, we want to be rescheduled if necessary 745 	 * In the latter case, we want to be rescheduled if necessary
715 */ 746 */
716static void rt_do_flush(int process_context) 747static void rt_do_flush(struct net *net, int process_context)
717{ 748{
718 unsigned int i; 749 unsigned int i;
719 struct rtable *rth, *next; 750 struct rtable *rth, *next;
720 struct rtable * tail;
721 751
722 for (i = 0; i <= rt_hash_mask; i++) { 752 for (i = 0; i <= rt_hash_mask; i++) {
753 struct rtable __rcu **pprev;
754 struct rtable *list;
755
723 if (process_context && need_resched()) 756 if (process_context && need_resched())
724 cond_resched(); 757 cond_resched();
725 rth = rt_hash_table[i].chain; 758 rth = rcu_dereference_raw(rt_hash_table[i].chain);
726 if (!rth) 759 if (!rth)
727 continue; 760 continue;
728 761
729 spin_lock_bh(rt_hash_lock_addr(i)); 762 spin_lock_bh(rt_hash_lock_addr(i));
730#ifdef CONFIG_NET_NS
731 {
732 struct rtable ** prev, * p;
733 763
734 rth = rt_hash_table[i].chain; 764 list = NULL;
765 pprev = &rt_hash_table[i].chain;
766 rth = rcu_dereference_protected(*pprev,
767 lockdep_is_held(rt_hash_lock_addr(i)));
735 768
736 /* defer releasing the head of the list after spin_unlock */ 769 while (rth) {
737 for (tail = rth; tail; tail = tail->dst.rt_next) 770 next = rcu_dereference_protected(rth->dst.rt_next,
738 if (!rt_is_expired(tail)) 771 lockdep_is_held(rt_hash_lock_addr(i)));
739 break; 772
740 if (rth != tail) 773 if (!net ||
741 rt_hash_table[i].chain = tail; 774 net_eq(dev_net(rth->dst.dev), net)) {
742 775 rcu_assign_pointer(*pprev, next);
743 /* call rt_free on entries after the tail requiring flush */ 776 rcu_assign_pointer(rth->dst.rt_next, list);
744 prev = &rt_hash_table[i].chain; 777 list = rth;
745 for (p = *prev; p; p = next) {
746 next = p->dst.rt_next;
747 if (!rt_is_expired(p)) {
748 prev = &p->dst.rt_next;
749 } else { 778 } else {
750 *prev = next; 779 pprev = &rth->dst.rt_next;
751 rt_free(p);
752 } 780 }
781 rth = next;
753 } 782 }
754 } 783
755#else
756 rth = rt_hash_table[i].chain;
757 rt_hash_table[i].chain = NULL;
758 tail = NULL;
759#endif
760 spin_unlock_bh(rt_hash_lock_addr(i)); 784 spin_unlock_bh(rt_hash_lock_addr(i));
761 785
762 for (; rth != tail; rth = next) { 786 for (; list; list = next) {
763 next = rth->dst.rt_next; 787 next = rcu_dereference_protected(list->dst.rt_next, 1);
764 rt_free(rth); 788 rt_free(list);
765 } 789 }
766 } 790 }
767} 791}
@@ -789,104 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
789 const struct rtable *aux = head; 813 const struct rtable *aux = head;
790 814
791 while (aux != rth) { 815 while (aux != rth) {
792 if (compare_hash_inputs(&aux->fl, &rth->fl)) 816 if (compare_hash_inputs(aux, rth))
793 return 0; 817 return 0;
794 aux = aux->dst.rt_next; 818 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
795 } 819 }
796 return ONE; 820 return ONE;
797} 821}
798 822
799static void rt_check_expire(void)
800{
801 static unsigned int rover;
802 unsigned int i = rover, goal;
803 struct rtable *rth, **rthp;
804 unsigned long samples = 0;
805 unsigned long sum = 0, sum2 = 0;
806 unsigned long delta;
807 u64 mult;
808
809 delta = jiffies - expires_ljiffies;
810 expires_ljiffies = jiffies;
811 mult = ((u64)delta) << rt_hash_log;
812 if (ip_rt_gc_timeout > 1)
813 do_div(mult, ip_rt_gc_timeout);
814 goal = (unsigned int)mult;
815 if (goal > rt_hash_mask)
816 goal = rt_hash_mask + 1;
817 for (; goal > 0; goal--) {
818 unsigned long tmo = ip_rt_gc_timeout;
819 unsigned long length;
820
821 i = (i + 1) & rt_hash_mask;
822 rthp = &rt_hash_table[i].chain;
823
824 if (need_resched())
825 cond_resched();
826
827 samples++;
828
829 if (*rthp == NULL)
830 continue;
831 length = 0;
832 spin_lock_bh(rt_hash_lock_addr(i));
833 while ((rth = *rthp) != NULL) {
834 prefetch(rth->dst.rt_next);
835 if (rt_is_expired(rth)) {
836 *rthp = rth->dst.rt_next;
837 rt_free(rth);
838 continue;
839 }
840 if (rth->dst.expires) {
841 /* Entry is expired even if it is in use */
842 if (time_before_eq(jiffies, rth->dst.expires)) {
843nofree:
844 tmo >>= 1;
845 rthp = &rth->dst.rt_next;
846 /*
847 * We only count entries on
848 * a chain with equal hash inputs once
849 * so that entries for different QOS
850 * levels, and other non-hash input
851 * attributes don't unfairly skew
852 * the length computation
853 */
854 length += has_noalias(rt_hash_table[i].chain, rth);
855 continue;
856 }
857 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858 goto nofree;
859
860 /* Cleanup aged off entries. */
861 *rthp = rth->dst.rt_next;
862 rt_free(rth);
863 }
864 spin_unlock_bh(rt_hash_lock_addr(i));
865 sum += length;
866 sum2 += length*length;
867 }
868 if (samples) {
869 unsigned long avg = sum / samples;
870 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871 rt_chain_length_max = max_t(unsigned long,
872 ip_rt_gc_elasticity,
873 (avg + 4*sd) >> FRACT_BITS);
874 }
875 rover = i;
876}
877
878/*
879 * rt_worker_func() is run in process context.
880 * we call rt_check_expire() to scan part of the hash table
881 */
882static void rt_worker_func(struct work_struct *work)
883{
884 rt_check_expire();
885 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886}
887
888/* 823/*
889 * Pertubation of rt_genid by a small quantity [1..256] 824 * Perturbation of rt_genid by a small quantity [1..256]
890 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 825 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
891 * many times (2^24) without giving recent rt_genid. 826 * many times (2^24) without giving recent rt_genid.
892  * Jenkins hash is strong enough that little changes of rt_genid are OK. 827  * Jenkins hash is strong enough that little changes of rt_genid are OK.
@@ -907,13 +842,13 @@ void rt_cache_flush(struct net *net, int delay)
907{ 842{
908 rt_cache_invalidate(net); 843 rt_cache_invalidate(net);
909 if (delay >= 0) 844 if (delay >= 0)
910 rt_do_flush(!in_softirq()); 845 rt_do_flush(net, !in_softirq());
911} 846}
912 847
913/* Flush previous cache invalidated entries from the cache */ 848/* Flush previous cache invalidated entries from the cache */
914void rt_cache_flush_batch(void) 849void rt_cache_flush_batch(struct net *net)
915{ 850{
916 rt_do_flush(!in_softirq()); 851 rt_do_flush(net, !in_softirq());
917} 852}
918 853
919static void rt_emergency_hash_rebuild(struct net *net) 854static void rt_emergency_hash_rebuild(struct net *net)
@@ -942,9 +877,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
942 static unsigned long last_gc; 877 static unsigned long last_gc;
943 static int rover; 878 static int rover;
944 static int equilibrium; 879 static int equilibrium;
945 struct rtable *rth, **rthp; 880 struct rtable *rth;
881 struct rtable __rcu **rthp;
946 unsigned long now = jiffies; 882 unsigned long now = jiffies;
947 int goal; 883 int goal;
884 int entries = dst_entries_get_fast(&ipv4_dst_ops);
948 885
949 /* 886 /*
950 * Garbage collection is pretty expensive, 887 * Garbage collection is pretty expensive,
@@ -954,28 +891,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
954 RT_CACHE_STAT_INC(gc_total); 891 RT_CACHE_STAT_INC(gc_total);
955 892
956 if (now - last_gc < ip_rt_gc_min_interval && 893 if (now - last_gc < ip_rt_gc_min_interval &&
957 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 894 entries < ip_rt_max_size) {
958 RT_CACHE_STAT_INC(gc_ignored); 895 RT_CACHE_STAT_INC(gc_ignored);
959 goto out; 896 goto out;
960 } 897 }
961 898
899 entries = dst_entries_get_slow(&ipv4_dst_ops);
962 /* Calculate number of entries, which we want to expire now. */ 900 /* Calculate number of entries, which we want to expire now. */
963 goal = atomic_read(&ipv4_dst_ops.entries) - 901 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
964 (ip_rt_gc_elasticity << rt_hash_log);
965 if (goal <= 0) { 902 if (goal <= 0) {
966 if (equilibrium < ipv4_dst_ops.gc_thresh) 903 if (equilibrium < ipv4_dst_ops.gc_thresh)
967 equilibrium = ipv4_dst_ops.gc_thresh; 904 equilibrium = ipv4_dst_ops.gc_thresh;
968 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 905 goal = entries - equilibrium;
969 if (goal > 0) { 906 if (goal > 0) {
970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 907 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 908 goal = entries - equilibrium;
972 } 909 }
973 } else { 910 } else {
974 /* We are in dangerous area. Try to reduce cache really 911 /* We are in dangerous area. Try to reduce cache really
975 * aggressively. 912 * aggressively.
976 */ 913 */
977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 914 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 915 equilibrium = entries - goal;
979 } 916 }
980 917
981 if (now - last_gc >= ip_rt_gc_min_interval) 918 if (now - last_gc >= ip_rt_gc_min_interval)
@@ -995,7 +932,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
995 k = (k + 1) & rt_hash_mask; 932 k = (k + 1) & rt_hash_mask;
996 rthp = &rt_hash_table[k].chain; 933 rthp = &rt_hash_table[k].chain;
997 spin_lock_bh(rt_hash_lock_addr(k)); 934 spin_lock_bh(rt_hash_lock_addr(k));
998 while ((rth = *rthp) != NULL) { 935 while ((rth = rcu_dereference_protected(*rthp,
936 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
999 if (!rt_is_expired(rth) && 937 if (!rt_is_expired(rth) &&
1000 !rt_may_expire(rth, tmo, expire)) { 938 !rt_may_expire(rth, tmo, expire)) {
1001 tmo >>= 1; 939 tmo >>= 1;
@@ -1030,16 +968,14 @@ static int rt_garbage_collect(struct dst_ops *ops)
1030 break; 968 break;
1031 969
1032 expire >>= 1; 970 expire >>= 1;
1033#if RT_CACHE_DEBUG >= 2
1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 atomic_read(&ipv4_dst_ops.entries), goal, i);
1036#endif
1037 971
1038 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 972 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out; 973 goto out;
1040 } while (!in_softirq() && time_before_eq(jiffies, now)); 974 } while (!in_softirq() && time_before_eq(jiffies, now));
1041 975
1042 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 976 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
977 goto out;
978 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1043 goto out; 979 goto out;
1044 if (net_ratelimit()) 980 if (net_ratelimit())
1045 printk(KERN_WARNING "dst cache overflow\n"); 981 printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,12 +985,9 @@ static int rt_garbage_collect(struct dst_ops *ops)
1049work_done: 985work_done:
1050 expire += ip_rt_gc_min_interval; 986 expire += ip_rt_gc_min_interval;
1051 if (expire > ip_rt_gc_timeout || 987 if (expire > ip_rt_gc_timeout ||
1052 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 988 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
989 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1053 expire = ip_rt_gc_timeout; 990 expire = ip_rt_gc_timeout;
1054#if RT_CACHE_DEBUG >= 2
1055 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1057#endif
1058out: return 0; 991out: return 0;
1059} 992}
1060 993
@@ -1068,17 +1001,17 @@ static int slow_chain_length(const struct rtable *head)
1068 1001
1069 while (rth) { 1002 while (rth) {
1070 length += has_noalias(head, rth); 1003 length += has_noalias(head, rth);
1071 rth = rth->dst.rt_next; 1004 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1072 } 1005 }
1073 return length >> FRACT_BITS; 1006 return length >> FRACT_BITS;
1074} 1007}
1075 1008
1076static int rt_intern_hash(unsigned hash, struct rtable *rt, 1009static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1077 struct rtable **rp, struct sk_buff *skb, int ifindex) 1010 struct sk_buff *skb, int ifindex)
1078{ 1011{
1079 struct rtable *rth, **rthp; 1012 struct rtable *rth, *cand;
1013 struct rtable __rcu **rthp, **candp;
1080 unsigned long now; 1014 unsigned long now;
1081 struct rtable *cand, **candp;
1082 u32 min_score; 1015 u32 min_score;
1083 int chain_length; 1016 int chain_length;
1084 int attempts = !in_softirq(); 1017 int attempts = !in_softirq();
@@ -1102,36 +1035,37 @@ restart:
1102 * Note that we do rt_free on this new route entry, so that 1035 * Note that we do rt_free on this new route entry, so that
1103 * once its refcount hits zero, we are still able to reap it 1036 * once its refcount hits zero, we are still able to reap it
1104 * (Thanks Alexey) 1037 * (Thanks Alexey)
1105 * Note also the rt_free uses call_rcu. We don't actually 1038 * Note: To avoid expensive rcu stuff for this uncached dst,
1106 * need rcu protection here, this is just our path to get 1039 * we set DST_NOCACHE so that dst_release() can free dst without
1107 * on the route gc list. 1040 * waiting a grace period.
1108 */ 1041 */
1109 1042
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1043 rt->dst.flags |= DST_NOCACHE;
1044 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1111 int err = arp_bind_neighbour(&rt->dst); 1045 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1046 if (err) {
1113 if (net_ratelimit()) 1047 if (net_ratelimit())
1114 printk(KERN_WARNING 1048 printk(KERN_WARNING
1115 "Neighbour table failure & not caching routes.\n"); 1049 "Neighbour table failure & not caching routes.\n");
1116 rt_drop(rt); 1050 ip_rt_put(rt);
1117 return err; 1051 return ERR_PTR(err);
1118 } 1052 }
1119 } 1053 }
1120 1054
1121 rt_free(rt);
1122 goto skip_hashing; 1055 goto skip_hashing;
1123 } 1056 }
1124 1057
1125 rthp = &rt_hash_table[hash].chain; 1058 rthp = &rt_hash_table[hash].chain;
1126 1059
1127 spin_lock_bh(rt_hash_lock_addr(hash)); 1060 spin_lock_bh(rt_hash_lock_addr(hash));
1128 while ((rth = *rthp) != NULL) { 1061 while ((rth = rcu_dereference_protected(*rthp,
1062 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1129 if (rt_is_expired(rth)) { 1063 if (rt_is_expired(rth)) {
1130 *rthp = rth->dst.rt_next; 1064 *rthp = rth->dst.rt_next;
1131 rt_free(rth); 1065 rt_free(rth);
1132 continue; 1066 continue;
1133 } 1067 }
1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1068 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1135 /* Put it first */ 1069 /* Put it first */
1136 *rthp = rth->dst.rt_next; 1070 *rthp = rth->dst.rt_next;
1137 /* 1071 /*
@@ -1151,11 +1085,9 @@ restart:
1151 spin_unlock_bh(rt_hash_lock_addr(hash)); 1085 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 1086
1153 rt_drop(rt); 1087 rt_drop(rt);
1154 if (rp) 1088 if (skb)
1155 *rp = rth;
1156 else
1157 skb_dst_set(skb, &rth->dst); 1089 skb_dst_set(skb, &rth->dst);
1158 return 0; 1090 return rth;
1159 } 1091 }
1160 1092
1161 if (!atomic_read(&rth->dst.__refcnt)) { 1093 if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1196,7 +1128,7 @@ restart:
1196 rt_emergency_hash_rebuild(net); 1128 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash)); 1129 spin_unlock_bh(rt_hash_lock_addr(hash));
1198 1130
1199 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1131 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200 ifindex, rt_genid(net)); 1132 ifindex, rt_genid(net));
1201 goto restart; 1133 goto restart;
1202 } 1134 }
@@ -1205,14 +1137,14 @@ restart:
1205 /* Try to bind route to arp only if it is output 1137 /* Try to bind route to arp only if it is output
1206 route or unicast forwarding path. 1138 route or unicast forwarding path.
1207 */ 1139 */
1208 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1140 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1209 int err = arp_bind_neighbour(&rt->dst); 1141 int err = arp_bind_neighbour(&rt->dst);
1210 if (err) { 1142 if (err) {
1211 spin_unlock_bh(rt_hash_lock_addr(hash)); 1143 spin_unlock_bh(rt_hash_lock_addr(hash));
1212 1144
1213 if (err != -ENOBUFS) { 1145 if (err != -ENOBUFS) {
1214 rt_drop(rt); 1146 rt_drop(rt);
1215 return err; 1147 return ERR_PTR(err);
1216 } 1148 }
1217 1149
1218 /* Neighbour tables are full and nothing 1150 /* Neighbour tables are full and nothing
@@ -1233,25 +1165,15 @@ restart:
1233 if (net_ratelimit()) 1165 if (net_ratelimit())
1234 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1166 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1235 rt_drop(rt); 1167 rt_drop(rt);
1236 return -ENOBUFS; 1168 return ERR_PTR(-ENOBUFS);
1237 } 1169 }
1238 } 1170 }
1239 1171
1240 rt->dst.rt_next = rt_hash_table[hash].chain; 1172 rt->dst.rt_next = rt_hash_table[hash].chain;
1241 1173
1242#if RT_CACHE_DEBUG >= 2
1243 if (rt->dst.rt_next) {
1244 struct rtable *trt;
1245 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1246 hash, &rt->rt_dst);
1247 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1248 printk(" . %pI4", &trt->rt_dst);
1249 printk("\n");
1250 }
1251#endif
1252 /* 1174 /*
1253 * Since lookup is lockfree, we must make sure 1175 * Since lookup is lockfree, we must make sure
1254 * previous writes to rt are comitted to memory 1176 * previous writes to rt are committed to memory
1255 * before making rt visible to other CPUS. 1177 * before making rt visible to other CPUS.
1256 */ 1178 */
1257 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1179 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
@@ -1259,28 +1181,28 @@ restart:
1259 spin_unlock_bh(rt_hash_lock_addr(hash)); 1181 spin_unlock_bh(rt_hash_lock_addr(hash));
1260 1182
1261skip_hashing: 1183skip_hashing:
1262 if (rp) 1184 if (skb)
1263 *rp = rt;
1264 else
1265 skb_dst_set(skb, &rt->dst); 1185 skb_dst_set(skb, &rt->dst);
1266 return 0; 1186 return rt;
1187}
1188
1189static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1190
1191static u32 rt_peer_genid(void)
1192{
1193 return atomic_read(&__rt_peer_genid);
1267} 1194}
1268 1195
1269void rt_bind_peer(struct rtable *rt, int create) 1196void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1270{ 1197{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer; 1198 struct inet_peer *peer;
1273 1199
1274 peer = inet_getpeer(rt->rt_dst, create); 1200 peer = inet_getpeer_v4(daddr, create);
1275 1201
1276 spin_lock_bh(&rt_peer_lock); 1202 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer); 1203 inet_putpeer(peer);
1204 else
1205 rt->rt_peer_genid = rt_peer_genid();
1284} 1206}
1285 1207
1286/* 1208/*
@@ -1309,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1309 1231
1310 if (rt) { 1232 if (rt) {
1311 if (rt->peer == NULL) 1233 if (rt->peer == NULL)
1312 rt_bind_peer(rt, 1); 1234 rt_bind_peer(rt, rt->rt_dst, 1);
1313 1235
1314 /* If peer is attached to destination, it is never detached, 1236 /* If peer is attached to destination, it is never detached,
1315 so that we need not to grab a lock to dereference it. 1237 so that we need not to grab a lock to dereference it.
@@ -1328,12 +1250,14 @@ EXPORT_SYMBOL(__ip_select_ident);
1328 1250
1329static void rt_del(unsigned hash, struct rtable *rt) 1251static void rt_del(unsigned hash, struct rtable *rt)
1330{ 1252{
1331 struct rtable **rthp, *aux; 1253 struct rtable __rcu **rthp;
1254 struct rtable *aux;
1332 1255
1333 rthp = &rt_hash_table[hash].chain; 1256 rthp = &rt_hash_table[hash].chain;
1334 spin_lock_bh(rt_hash_lock_addr(hash)); 1257 spin_lock_bh(rt_hash_lock_addr(hash));
1335 ip_rt_put(rt); 1258 ip_rt_put(rt);
1336 while ((aux = *rthp) != NULL) { 1259 while ((aux = rcu_dereference_protected(*rthp,
1260 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1337 if (aux == rt || rt_is_expired(aux)) { 1261 if (aux == rt || rt_is_expired(aux)) {
1338 *rthp = aux->dst.rt_next; 1262 *rthp = aux->dst.rt_next;
1339 rt_free(aux); 1263 rt_free(aux);
@@ -1348,12 +1272,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1348void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1272void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1349 __be32 saddr, struct net_device *dev) 1273 __be32 saddr, struct net_device *dev)
1350{ 1274{
1351 int i, k;
1352 struct in_device *in_dev = __in_dev_get_rcu(dev); 1275 struct in_device *in_dev = __in_dev_get_rcu(dev);
1353 struct rtable *rth, **rthp; 1276 struct inet_peer *peer;
1354 __be32 skeys[2] = { saddr, 0 };
1355 int ikeys[2] = { dev->ifindex, 0 };
1356 struct netevent_redirect netevent;
1357 struct net *net; 1277 struct net *net;
1358 1278
1359 if (!in_dev) 1279 if (!in_dev)
@@ -1365,9 +1285,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1365 ipv4_is_zeronet(new_gw)) 1285 ipv4_is_zeronet(new_gw))
1366 goto reject_redirect; 1286 goto reject_redirect;
1367 1287
1368 if (!rt_caching(net))
1369 goto reject_redirect;
1370
1371 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1288 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1372 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1289 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1373 goto reject_redirect; 1290 goto reject_redirect;
@@ -1378,93 +1295,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1378 goto reject_redirect; 1295 goto reject_redirect;
1379 } 1296 }
1380 1297
1381 for (i = 0; i < 2; i++) { 1298 peer = inet_getpeer_v4(daddr, 1);
1382 for (k = 0; k < 2; k++) { 1299 if (peer) {
1383 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1300 peer->redirect_learned.a4 = new_gw;
1384 rt_genid(net));
1385
1386 rthp=&rt_hash_table[hash].chain;
1387
1388 while ((rth = rcu_dereference(*rthp)) != NULL) {
1389 struct rtable *rt;
1390
1391 if (rth->fl.fl4_dst != daddr ||
1392 rth->fl.fl4_src != skeys[i] ||
1393 rth->fl.oif != ikeys[k] ||
1394 rth->fl.iif != 0 ||
1395 rt_is_expired(rth) ||
1396 !net_eq(dev_net(rth->dst.dev), net)) {
1397 rthp = &rth->dst.rt_next;
1398 continue;
1399 }
1400
1401 if (rth->rt_dst != daddr ||
1402 rth->rt_src != saddr ||
1403 rth->dst.error ||
1404 rth->rt_gateway != old_gw ||
1405 rth->dst.dev != dev)
1406 break;
1407
1408 dst_hold(&rth->dst);
1409
1410 rt = dst_alloc(&ipv4_dst_ops);
1411 if (rt == NULL) {
1412 ip_rt_put(rth);
1413 return;
1414 }
1415
1416 /* Copy all the information. */
1417 *rt = *rth;
1418 rt->dst.__use = 1;
1419 atomic_set(&rt->dst.__refcnt, 1);
1420 rt->dst.child = NULL;
1421 if (rt->dst.dev)
1422 dev_hold(rt->dst.dev);
1423 if (rt->idev)
1424 in_dev_hold(rt->idev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1301
1455 netevent.old = &rth->dst; 1302 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1303
1460 rt_del(hash, rth); 1304 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1305 }
1469 return; 1306 return;
1470 1307
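The rewritten ip_rt_redirect() above no longer clones and re-hashes every matching cache entry. It records the learned gateway on the destination's inet_peer and bumps the global __rt_peer_genid; cached routes whose rt_peer_genid no longer matches re-pull the peer state lazily in ipv4_dst_check() further down. A compact userspace sketch of that producer/consumer generation-counter pattern (names and types invented, not the kernel interfaces):

#include <stdatomic.h>
#include <stdio.h>

/* Shared state describing the "latest truth" about a destination. */
static atomic_uint peer_genid;          /* bumped whenever peer data changes */
static unsigned int learned_gateway;    /* updated by redirect processing */

struct cached_route {
    unsigned int gateway;
    unsigned int genid_snapshot;        /* genid when this entry last synced */
};

/* Producer: a redirect was accepted, publish it cheaply. */
static void redirect_learned(unsigned int new_gw)
{
    learned_gateway = new_gw;
    atomic_fetch_add(&peer_genid, 1);   /* invalidate all cached snapshots */
}

/* Consumer: called on every use of the cached route (dst_check style). */
static void route_revalidate(struct cached_route *rt)
{
    unsigned int now = atomic_load(&peer_genid);

    if (rt->genid_snapshot != now) {    /* stale: re-read the peer data */
        rt->gateway = learned_gateway;
        rt->genid_snapshot = now;
    }
}

int main(void)
{
    struct cached_route rt = { .gateway = 0x0a000001, .genid_snapshot = 0 };

    route_revalidate(&rt);
    redirect_learned(0x0a0000fe);
    route_revalidate(&rt);              /* picks up the new gateway lazily */
    printf("gateway now %#x\n", rt.gateway);
    return 0;
}
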
@@ -1479,6 +1316,23 @@ reject_redirect:
1479 ; 1316 ;
1480} 1317}
1481 1318
1319static bool peer_pmtu_expired(struct inet_peer *peer)
1320{
1321 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1322
1323 return orig &&
1324 time_after_eq(jiffies, orig) &&
1325 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1326}
1327
1328static bool peer_pmtu_cleaned(struct inet_peer *peer)
1329{
1330 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1331
1332 return orig &&
1333 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1334}
1335
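peer_pmtu_expired() and peer_pmtu_cleaned() above share one idiom: read pmtu_expires once, then let exactly one CPU win the cmpxchg() that swaps it to 0; only that winner goes on to restore pmtu_orig (as ipv4_negative_advice() and ipv4_link_failure() do below), everyone else backs off. A userspace sketch of the claim-once idiom with C11 atomics; the structure is a stand-in, and the plain >= comparison ignores the jiffies wraparound that time_after_eq() handles in the kernel.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct peer_stub {
    atomic_ulong pmtu_expires;      /* 0 means "no learned PMTU pending" */
    unsigned int pmtu_orig;
    unsigned int pmtu_learned;
};

/* Returns true for exactly one caller once the learned PMTU expires;
 * that caller restores the original metric. */
static bool pmtu_expired_claim(struct peer_stub *p, unsigned long now)
{
    unsigned long orig = atomic_load(&p->pmtu_expires);

    return orig != 0 &&
           now >= orig &&
           atomic_compare_exchange_strong(&p->pmtu_expires, &orig, 0);
}

int main(void)
{
    struct peer_stub p = {
        .pmtu_expires = 100, .pmtu_orig = 1500, .pmtu_learned = 1400,
    };

    if (pmtu_expired_claim(&p, 150))
        printf("restore MTU to %u\n", p.pmtu_orig);
    if (!pmtu_expired_claim(&p, 150))
        printf("second claim correctly refused\n");
    return 0;
}
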
1482static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1336static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1483{ 1337{
1484 struct rtable *rt = (struct rtable *)dst; 1338 struct rtable *rt = (struct rtable *)dst;
@@ -1488,18 +1342,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1342 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1343 ip_rt_put(rt);
1490 ret = NULL; 1344 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1345 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1492 (rt->dst.expires && 1346 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1493 time_after_eq(jiffies, rt->dst.expires))) { 1347 rt->rt_oif,
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev))); 1348 rt_genid(dev_net(dst->dev)));
1497#if RT_CACHE_DEBUG >= 1
1498 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 &rt->rt_dst, rt->fl.fl4_tos);
1500#endif
1501 rt_del(hash, rt); 1349 rt_del(hash, rt);
1502 ret = NULL; 1350 ret = NULL;
1351 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1352 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1503 } 1353 }
1504 } 1354 }
1505 return ret; 1355 return ret;
@@ -1525,6 +1375,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1375{
1526 struct rtable *rt = skb_rtable(skb); 1376 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1377 struct in_device *in_dev;
1378 struct inet_peer *peer;
1528 int log_martians; 1379 int log_martians;
1529 1380
1530 rcu_read_lock(); 1381 rcu_read_lock();
@@ -1536,36 +1387,44 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1387 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1388 rcu_read_unlock();
1538 1389
1390 if (!rt->peer)
1391 rt_bind_peer(rt, rt->rt_dst, 1);
1392 peer = rt->peer;
1393 if (!peer) {
1394 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1395 return;
1396 }
1397
1539 /* No redirected packets during ip_rt_redirect_silence; 1398 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1399 * reset the algorithm.
1541 */ 1400 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) 1401 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0; 1402 peer->rate_tokens = 0;
1544 1403
1545 /* Too many ignored redirects; do not send anything 1404 /* Too many ignored redirects; do not send anything
1546 * set dst.rate_last to the last seen redirected packet. 1405 * set dst.rate_last to the last seen redirected packet.
1547 */ 1406 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) { 1407 if (peer->rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies; 1408 peer->rate_last = jiffies;
1550 return; 1409 return;
1551 } 1410 }
1552 1411
1553 /* Check for load limit; set rate_last to the latest sent 1412 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1413 * redirect.
1555 */ 1414 */
1556 if (rt->dst.rate_tokens == 0 || 1415 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1416 time_after(jiffies,
1558 (rt->dst.rate_last + 1417 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) { 1418 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1419 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies; 1420 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens; 1421 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1422#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1423 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number && 1424 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1425 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1426 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1427 &ip_hdr(skb)->saddr, rt->rt_iif,
1569 &rt->rt_dst, &rt->rt_gateway); 1428 &rt->rt_dst, &rt->rt_gateway);
1570#endif 1429#endif
1571 } 1430 }
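
The redirect throttling state (rate_last, rate_tokens) moves from the dst entry to the inet_peer in this hunk, so it is now shared by every cached route to the same host, but the arithmetic is unchanged: after a long silence the counter resets, after ip_rt_redirect_number ignored redirects we stop sending, and otherwise the required gap doubles with every redirect sent (ip_rt_redirect_load << rate_tokens). A self-contained sketch of that exponential backoff, with invented tunable values in place of the sysctls:

#include <stdbool.h>
#include <stdio.h>

/* Tunables standing in for ip_rt_redirect_load / _number / _silence. */
#define REDIRECT_LOAD    5      /* base gap between redirects          */
#define REDIRECT_NUMBER  9      /* give up after this many in a row    */
#define REDIRECT_SILENCE (5 << 9)

struct peer_rate {
    unsigned long rate_last;    /* time of the last redirect we sent   */
    unsigned int  rate_tokens;  /* how many we have sent in this burst */
};

static bool should_send_redirect(struct peer_rate *p, unsigned long now)
{
    /* Long quiet period: the host may have recovered, start over. */
    if (now > p->rate_last + REDIRECT_SILENCE)
        p->rate_tokens = 0;

    /* Host keeps ignoring us: stop sending, just refresh the stamp. */
    if (p->rate_tokens >= REDIRECT_NUMBER) {
        p->rate_last = now;
        return false;
    }

    /* Exponential backoff: required gap doubles with every redirect. */
    if (p->rate_tokens == 0 ||
        now > p->rate_last + (REDIRECT_LOAD << p->rate_tokens)) {
        p->rate_last = now;
        p->rate_tokens++;
        return true;
    }
    return false;
}

int main(void)
{
    struct peer_rate p = { 0, 0 };

    for (unsigned long t = 0; t < 200; t += 10)
        if (should_send_redirect(&p, t))
            printf("t=%lu: send redirect (tokens=%u)\n", t, p.rate_tokens);
    return 0;
}
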
@@ -1574,7 +1433,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1433static int ip_error(struct sk_buff *skb)
1575{ 1434{
1576 struct rtable *rt = skb_rtable(skb); 1435 struct rtable *rt = skb_rtable(skb);
1436 struct inet_peer *peer;
1577 unsigned long now; 1437 unsigned long now;
1438 bool send;
1578 int code; 1439 int code;
1579 1440
1580 switch (rt->dst.error) { 1441 switch (rt->dst.error) {
@@ -1594,15 +1455,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1455 break;
1595 } 1456 }
1596 1457
1597 now = jiffies; 1458 if (!rt->peer)
1598 rt->dst.rate_tokens += now - rt->dst.rate_last; 1459 rt_bind_peer(rt, rt->rt_dst, 1);
1599 if (rt->dst.rate_tokens > ip_rt_error_burst) 1460 peer = rt->peer;
1600 rt->dst.rate_tokens = ip_rt_error_burst; 1461
1601 rt->dst.rate_last = now; 1462 send = true;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) { 1463 if (peer) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost; 1464 now = jiffies;
1465 peer->rate_tokens += now - peer->rate_last;
1466 if (peer->rate_tokens > ip_rt_error_burst)
1467 peer->rate_tokens = ip_rt_error_burst;
1468 peer->rate_last = now;
1469 if (peer->rate_tokens >= ip_rt_error_cost)
1470 peer->rate_tokens -= ip_rt_error_cost;
1471 else
1472 send = false;
1473 }
1474 if (send)
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1475 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1605 }
1606 1476
1607out: kfree_skb(skb); 1477out: kfree_skb(skb);
1608 return 0; 1478 return 0;
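
ip_error() keeps its token-bucket limiter, only the bucket now lives on the peer: tokens accrue with the time elapsed since rate_last, are capped at ip_rt_error_burst, and each ICMP error costs ip_rt_error_cost; if no peer can be bound, send stays true and the message goes out unthrottled. A sketch of the bucket arithmetic with placeholder constants:

#include <stdbool.h>
#include <stdio.h>

#define ERROR_COST   100   /* stand-in for ip_rt_error_cost            */
#define ERROR_BURST  500   /* stand-in for ip_rt_error_burst (the cap) */

struct err_bucket {
    unsigned long rate_tokens;  /* accrued "credit", in time units */
    unsigned long rate_last;    /* when we last updated the bucket */
};

/* Returns true if an ICMP error may be emitted at time 'now'. */
static bool error_allowed(struct err_bucket *b, unsigned long now)
{
    b->rate_tokens += now - b->rate_last;    /* credit for elapsed time */
    if (b->rate_tokens > ERROR_BURST)
        b->rate_tokens = ERROR_BURST;        /* cap the burst size      */
    b->rate_last = now;

    if (b->rate_tokens >= ERROR_COST) {
        b->rate_tokens -= ERROR_COST;        /* pay for this message    */
        return true;
    }
    return false;
}

int main(void)
{
    struct err_bucket b = { ERROR_BURST, 0 };
    unsigned int sent = 0;

    /* A flood of 20 errors arriving 10 time units apart: only the
     * initial burst plus the slow refill get through. */
    for (unsigned long t = 0; t < 200; t += 10)
        if (error_allowed(&b, t))
            sent++;
    printf("sent %u of 20\n", sent);
    return 0;
}
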
@@ -1626,88 +1496,148 @@ static inline unsigned short guess_mtu(unsigned short old_mtu)
1626 return 68; 1496 return 68;
1627} 1497}
1628 1498
1629unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, 1499unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1630 unsigned short new_mtu, 1500 unsigned short new_mtu,
1631 struct net_device *dev) 1501 struct net_device *dev)
1632{ 1502{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1503 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1504 unsigned short est_mtu = 0;
1505 struct inet_peer *peer;
1640 1506
1641 for (k = 0; k < 2; k++) { 1507 peer = inet_getpeer_v4(iph->daddr, 1);
1642 for (i = 0; i < 2; i++) { 1508 if (peer) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1509 unsigned short mtu = new_mtu;
1644 rt_genid(net)); 1510
1645 1511 if (new_mtu < 68 || new_mtu >= old_mtu) {
1646 rcu_read_lock(); 1512 /* BSD 4.2 derived systems incorrectly adjust
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 1513 * tot_len by the IP header length, and report
1648 rth = rcu_dereference(rth->dst.rt_next)) { 1514 * a zero MTU in the ICMP message.
1649 unsigned short mtu = new_mtu; 1515 */
1650 1516 if (mtu == 0 &&
1651 if (rth->fl.fl4_dst != daddr || 1517 old_mtu >= 68 + (iph->ihl << 2))
1652 rth->fl.fl4_src != skeys[i] || 1518 old_mtu -= iph->ihl << 2;
1653 rth->rt_dst != daddr || 1519 mtu = guess_mtu(old_mtu);
1654 rth->rt_src != iph->saddr || 1520 }
1655 rth->fl.oif != ikeys[k] ||
1656 rth->fl.iif != 0 ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661 1521
1662 if (new_mtu < 68 || new_mtu >= old_mtu) { 1522 if (mtu < ip_rt_min_pmtu)
1523 mtu = ip_rt_min_pmtu;
1524 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1525 unsigned long pmtu_expires;
1663 1526
1664 /* BSD 4.2 compatibility hack :-( */ 1527 pmtu_expires = jiffies + ip_rt_mtu_expires;
1665 if (mtu == 0 && 1528 if (!pmtu_expires)
1666 old_mtu >= dst_mtu(&rth->dst) && 1529 pmtu_expires = 1UL;
1667 old_mtu >= 68 + (iph->ihl << 2))
1668 old_mtu -= iph->ihl << 2;
1669 1530
1670 mtu = guess_mtu(old_mtu); 1531 est_mtu = mtu;
1671 } 1532 peer->pmtu_learned = mtu;
1672 if (mtu <= dst_mtu(&rth->dst)) { 1533 peer->pmtu_expires = pmtu_expires;
1673 if (mtu < dst_mtu(&rth->dst)) {
1674 dst_confirm(&rth->dst);
1675 if (mtu < ip_rt_min_pmtu) {
1676 mtu = ip_rt_min_pmtu;
1677 rth->dst.metrics[RTAX_LOCK-1] |=
1678 (1 << RTAX_MTU);
1679 }
1680 rth->dst.metrics[RTAX_MTU-1] = mtu;
1681 dst_set_expires(&rth->dst,
1682 ip_rt_mtu_expires);
1683 }
1684 est_mtu = mtu;
1685 }
1686 }
1687 rcu_read_unlock();
1688 } 1534 }
1535
1536 inet_putpeer(peer);
1537
1538 atomic_inc(&__rt_peer_genid);
1689 } 1539 }
1690 return est_mtu ? : new_mtu; 1540 return est_mtu ? : new_mtu;
1691} 1541}
1692 1542
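ip_rt_frag_needed() now records the learned PMTU on the inet_peer instead of walking hash buckets, but the sanitising of the reported value is preserved: an MTU that is absurd or zero (BSD 4.2 derived stacks add the header length to tot_len and report 0) is replaced by a guess from the original datagram length and then clamped to ip_rt_min_pmtu. A sketch of that sanitising step; the plateau table inside guess_mtu() is not part of this hunk, so the values below are purely illustrative (only its final 68-byte fallback is visible above).

#include <stdio.h>

#define IP_RT_MIN_PMTU 552          /* stand-in for ip_rt_min_pmtu */

/* Illustrative plateau search in the spirit of RFC 1191; the kernel's
 * guess_mtu() table may differ. */
static unsigned short guess_mtu(unsigned short old_mtu)
{
    static const unsigned short plateaus[] =
        { 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

    for (unsigned int i = 0; i < sizeof(plateaus) / sizeof(plateaus[0]); i++)
        if (old_mtu > plateaus[i])
            return plateaus[i];
    return 68;
}

/* new_mtu: MTU reported in the ICMP "frag needed" message.
 * tot_len: total length field of the offending datagram's IP header.
 * ihl:     its header length in 32-bit words. */
static unsigned short sanitize_reported_mtu(unsigned short new_mtu,
                                            unsigned short tot_len,
                                            unsigned char ihl)
{
    unsigned short old_mtu = tot_len;
    unsigned short mtu = new_mtu;

    if (new_mtu < 68 || new_mtu >= old_mtu) {
        /* BSD 4.2 derived systems add the header length to tot_len
         * and report MTU 0; undo that before guessing. */
        if (mtu == 0 && old_mtu >= 68 + (ihl << 2))
            old_mtu -= ihl << 2;
        mtu = guess_mtu(old_mtu);
    }
    if (mtu < IP_RT_MIN_PMTU)
        mtu = IP_RT_MIN_PMTU;
    return mtu;
}

int main(void)
{
    /* Broken peer: reports MTU 0 for a 1520-byte datagram (20-byte header). */
    printf("learned PMTU: %u\n", sanitize_reported_mtu(0, 1520, 5));
    /* Sane peer: reports 1400 directly, which is used as-is. */
    printf("learned PMTU: %u\n", sanitize_reported_mtu(1400, 1500, 5));
    return 0;
}
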
1543static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1544{
1545 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1546
1547 if (!expires)
1548 return;
1549 if (time_before(jiffies, expires)) {
1550 u32 orig_dst_mtu = dst_mtu(dst);
1551 if (peer->pmtu_learned < orig_dst_mtu) {
1552 if (!peer->pmtu_orig)
1553 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1554 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1555 }
1556 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1557 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1558}
1559
1693static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1560static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1694{ 1561{
1695 if (dst_mtu(dst) > mtu && mtu >= 68 && 1562 struct rtable *rt = (struct rtable *) dst;
1696 !(dst_metric_locked(dst, RTAX_MTU))) { 1563 struct inet_peer *peer;
1697 if (mtu < ip_rt_min_pmtu) { 1564
1565 dst_confirm(dst);
1566
1567 if (!rt->peer)
1568 rt_bind_peer(rt, rt->rt_dst, 1);
1569 peer = rt->peer;
1570 if (peer) {
1571 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1572
1573 if (mtu < ip_rt_min_pmtu)
1698 mtu = ip_rt_min_pmtu; 1574 mtu = ip_rt_min_pmtu;
1699 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1575 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1576
1577 pmtu_expires = jiffies + ip_rt_mtu_expires;
1578 if (!pmtu_expires)
1579 pmtu_expires = 1UL;
1580
1581 peer->pmtu_learned = mtu;
1582 peer->pmtu_expires = pmtu_expires;
1583
1584 atomic_inc(&__rt_peer_genid);
1585 rt->rt_peer_genid = rt_peer_genid();
1700 } 1586 }
1701 dst->metrics[RTAX_MTU-1] = mtu; 1587 check_peer_pmtu(dst, peer);
1702 dst_set_expires(dst, ip_rt_mtu_expires); 1588 }
1703 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1589}
1590
1591static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1592{
1593 struct rtable *rt = (struct rtable *) dst;
1594 __be32 orig_gw = rt->rt_gateway;
1595
1596 dst_confirm(&rt->dst);
1597
1598 neigh_release(rt->dst.neighbour);
1599 rt->dst.neighbour = NULL;
1600
1601 rt->rt_gateway = peer->redirect_learned.a4;
1602 if (arp_bind_neighbour(&rt->dst) ||
1603 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1604 if (rt->dst.neighbour)
1605 neigh_event_send(rt->dst.neighbour, NULL);
1606 rt->rt_gateway = orig_gw;
1607 return -EAGAIN;
1608 } else {
1609 rt->rt_flags |= RTCF_REDIRECTED;
1610 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1611 rt->dst.neighbour);
1704 } 1612 }
1613 return 0;
1705} 1614}
1706 1615
1707static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1616static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1708{ 1617{
1709 if (rt_is_expired((struct rtable *)dst)) 1618 struct rtable *rt = (struct rtable *) dst;
1619
1620 if (rt_is_expired(rt))
1710 return NULL; 1621 return NULL;
1622 if (rt->rt_peer_genid != rt_peer_genid()) {
1623 struct inet_peer *peer;
1624
1625 if (!rt->peer)
1626 rt_bind_peer(rt, rt->rt_dst, 0);
1627
1628 peer = rt->peer;
1629 if (peer) {
1630 check_peer_pmtu(dst, peer);
1631
1632 if (peer->redirect_learned.a4 &&
1633 peer->redirect_learned.a4 != rt->rt_gateway) {
1634 if (check_peer_redir(dst, peer))
1635 return NULL;
1636 }
1637 }
1638
1639 rt->rt_peer_genid = rt_peer_genid();
1640 }
1711 return dst; 1641 return dst;
1712} 1642}
1713 1643
@@ -1715,33 +1645,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1715{ 1645{
1716 struct rtable *rt = (struct rtable *) dst; 1646 struct rtable *rt = (struct rtable *) dst;
1717 struct inet_peer *peer = rt->peer; 1647 struct inet_peer *peer = rt->peer;
1718 struct in_device *idev = rt->idev;
1719 1648
1649 if (rt->fi) {
1650 fib_info_put(rt->fi);
1651 rt->fi = NULL;
1652 }
1720 if (peer) { 1653 if (peer) {
1721 rt->peer = NULL; 1654 rt->peer = NULL;
1722 inet_putpeer(peer); 1655 inet_putpeer(peer);
1723 } 1656 }
1724
1725 if (idev) {
1726 rt->idev = NULL;
1727 in_dev_put(idev);
1728 }
1729} 1657}
1730 1658
1731static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1732 int how)
1733{
1734 struct rtable *rt = (struct rtable *) dst;
1735 struct in_device *idev = rt->idev;
1736 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1737 struct in_device *loopback_idev =
1738 in_dev_get(dev_net(dev)->loopback_dev);
1739 if (loopback_idev) {
1740 rt->idev = loopback_idev;
1741 in_dev_put(idev);
1742 }
1743 }
1744}
1745 1659
1746static void ipv4_link_failure(struct sk_buff *skb) 1660static void ipv4_link_failure(struct sk_buff *skb)
1747{ 1661{
@@ -1750,8 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1750 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1664 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1751 1665
1752 rt = skb_rtable(skb); 1666 rt = skb_rtable(skb);
1753 if (rt) 1667 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1754 dst_set_expires(&rt->dst, 0); 1668 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1755} 1669}
1756 1670
1757static int ip_rt_bug(struct sk_buff *skb) 1671static int ip_rt_bug(struct sk_buff *skb)
@@ -1760,6 +1674,7 @@ static int ip_rt_bug(struct sk_buff *skb)
1760 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1674 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1761 skb->dev ? skb->dev->name : "?"); 1675 skb->dev ? skb->dev->name : "?");
1762 kfree_skb(skb); 1676 kfree_skb(skb);
1677 WARN_ON(1);
1763 return 0; 1678 return 0;
1764} 1679}
1765 1680
@@ -1772,23 +1687,39 @@ static int ip_rt_bug(struct sk_buff *skb)
1772 in IP options! 1687 in IP options!
1773 */ 1688 */
1774 1689
1775void ip_rt_get_source(u8 *addr, struct rtable *rt) 1690void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1776{ 1691{
1777 __be32 src; 1692 __be32 src;
1778 struct fib_result res;
1779 1693
1780 if (rt->fl.iif == 0) 1694 if (rt_is_output_route(rt))
1781 src = rt->rt_src; 1695 src = ip_hdr(skb)->saddr;
1782 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { 1696 else {
1783 src = FIB_RES_PREFSRC(res); 1697 struct fib_result res;
1784 fib_res_put(&res); 1698 struct flowi4 fl4;
1785 } else 1699 struct iphdr *iph;
1786 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1700
1701 iph = ip_hdr(skb);
1702
1703 memset(&fl4, 0, sizeof(fl4));
1704 fl4.daddr = iph->daddr;
1705 fl4.saddr = iph->saddr;
1706 fl4.flowi4_tos = iph->tos;
1707 fl4.flowi4_oif = rt->dst.dev->ifindex;
1708 fl4.flowi4_iif = skb->dev->ifindex;
1709 fl4.flowi4_mark = skb->mark;
1710
1711 rcu_read_lock();
1712 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1713 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1714 else
1715 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1787 RT_SCOPE_UNIVERSE); 1716 RT_SCOPE_UNIVERSE);
1717 rcu_read_unlock();
1718 }
1788 memcpy(addr, &src, 4); 1719 memcpy(addr, &src, 4);
1789} 1720}
1790 1721
1791#ifdef CONFIG_NET_CLS_ROUTE 1722#ifdef CONFIG_IP_ROUTE_CLASSID
1792static void set_class_tag(struct rtable *rt, u32 tag) 1723static void set_class_tag(struct rtable *rt, u32 tag)
1793{ 1724{
1794 if (!(rt->dst.tclassid & 0xFFFF)) 1725 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1798,46 +1729,107 @@ static void set_class_tag(struct rtable *rt, u32 tag)
1798} 1729}
1799#endif 1730#endif
1800 1731
1801static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1732static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1802{ 1733{
1803 struct fib_info *fi = res->fi; 1734 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1735
1736 if (advmss == 0) {
1737 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1738 ip_rt_min_advmss);
1739 if (advmss > 65535 - 40)
1740 advmss = 65535 - 40;
1741 }
1742 return advmss;
1743}
1744
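ipv4_default_advmss() above computes the advertised MSS on demand when the RTAX_ADVMSS metric is unset: device MTU minus 40 bytes of IPv4 plus TCP headers, raised to ip_rt_min_advmss and capped so the segment still fits a 16-bit total length; ipv4_default_mtu() below applies the matching default for the path MTU. A sketch of the advmss arithmetic with a placeholder floor value:

#include <stdio.h>

#define IP_RT_MIN_ADVMSS 256   /* stand-in for the ip_rt_min_advmss sysctl */

/* Default advertised MSS for a route with no explicit RTAX_ADVMSS metric:
 * leave room for 20 bytes of IPv4 header plus 20 bytes of TCP header,
 * never advertise less than the configured floor, and never more than
 * the largest value that still fits a 16-bit total length. */
static unsigned int default_advmss(unsigned int dev_mtu)
{
    unsigned int advmss = dev_mtu > 40 ? dev_mtu - 40 : 0;

    if (advmss < IP_RT_MIN_ADVMSS)
        advmss = IP_RT_MIN_ADVMSS;
    if (advmss > 65535 - 40)
        advmss = 65535 - 40;
    return advmss;
}

int main(void)
{
    printf("ethernet (1500): %u\n", default_advmss(1500));    /* 1460  */
    printf("loopback (65536): %u\n", default_advmss(65536));  /* 65495 */
    return 0;
}
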
1745static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1746{
1747 unsigned int mtu = dst->dev->mtu;
1748
1749 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1750 const struct rtable *rt = (const struct rtable *) dst;
1751
1752 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1753 mtu = 576;
1754 }
1755
1756 if (mtu > IP_MAX_MTU)
1757 mtu = IP_MAX_MTU;
1758
1759 return mtu;
1760}
1761
1762static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1763 struct fib_info *fi)
1764{
1765 struct inet_peer *peer;
1766 int create = 0;
1767
1768 /* If a peer entry exists for this destination, we must hook
1769 * it up in order to get at cached metrics.
1770 */
1771 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1772 create = 1;
1773
1774 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1775 if (peer) {
1776 rt->rt_peer_genid = rt_peer_genid();
1777 if (inet_metrics_new(peer))
1778 memcpy(peer->metrics, fi->fib_metrics,
1779 sizeof(u32) * RTAX_MAX);
1780 dst_init_metrics(&rt->dst, peer->metrics, false);
1781
1782 check_peer_pmtu(&rt->dst, peer);
1783 if (peer->redirect_learned.a4 &&
1784 peer->redirect_learned.a4 != rt->rt_gateway) {
1785 rt->rt_gateway = peer->redirect_learned.a4;
1786 rt->rt_flags |= RTCF_REDIRECTED;
1787 }
1788 } else {
1789 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1790 rt->fi = fi;
1791 atomic_inc(&fi->fib_clntref);
1792 }
1793 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1794 }
1795}
1796
1797static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1798 const struct fib_result *res,
1799 struct fib_info *fi, u16 type, u32 itag)
1800{
1801 struct dst_entry *dst = &rt->dst;
1804 1802
1805 if (fi) { 1803 if (fi) {
1806 if (FIB_RES_GW(*res) && 1804 if (FIB_RES_GW(*res) &&
1807 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1805 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808 rt->rt_gateway = FIB_RES_GW(*res); 1806 rt->rt_gateway = FIB_RES_GW(*res);
1809 memcpy(rt->dst.metrics, fi->fib_metrics, 1807 rt_init_metrics(rt, fl4, fi);
1810 sizeof(rt->dst.metrics)); 1808#ifdef CONFIG_IP_ROUTE_CLASSID
1811 if (fi->fib_mtu == 0) { 1809 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1812 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814 rt->rt_gateway != rt->rt_dst &&
1815 rt->dst.dev->mtu > 576)
1816 rt->dst.metrics[RTAX_MTU-1] = 576;
1817 }
1818#ifdef CONFIG_NET_CLS_ROUTE
1819 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820#endif 1810#endif
1821 } else 1811 }
1822 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; 1812
1823 1813 if (dst_mtu(dst) > IP_MAX_MTU)
1824 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) 1814 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1825 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1815 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1826 if (dst_mtu(&rt->dst) > IP_MAX_MTU) 1816 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1827 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; 1817
1828 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) 1818#ifdef CONFIG_IP_ROUTE_CLASSID
1829 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830 ip_rt_min_advmss);
1831 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833
1834#ifdef CONFIG_NET_CLS_ROUTE
1835#ifdef CONFIG_IP_MULTIPLE_TABLES 1819#ifdef CONFIG_IP_MULTIPLE_TABLES
1836 set_class_tag(rt, fib_rules_tclass(res)); 1820 set_class_tag(rt, fib_rules_tclass(res));
1837#endif 1821#endif
1838 set_class_tag(rt, itag); 1822 set_class_tag(rt, itag);
1839#endif 1823#endif
1840 rt->rt_type = res->type; 1824}
1825
1826static struct rtable *rt_dst_alloc(struct net_device *dev,
1827 bool nopolicy, bool noxfrm)
1828{
1829 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1830 DST_HOST |
1831 (nopolicy ? DST_NOPOLICY : 0) |
1832 (noxfrm ? DST_NOXFRM : 0));
1841} 1833}
1842 1834
1843/* called in rcu_read_lock() section */ 1835/* called in rcu_read_lock() section */
@@ -1865,42 +1857,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865 goto e_inval; 1857 goto e_inval;
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 } else { 1859 } else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 1860 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag, 0); 1861 &itag);
1870 if (err < 0) 1862 if (err < 0)
1871 goto e_err; 1863 goto e_err;
1872 } 1864 }
1873 rth = dst_alloc(&ipv4_dst_ops); 1865 rth = rt_dst_alloc(init_net.loopback_dev,
1866 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1874 if (!rth) 1867 if (!rth)
1875 goto e_nobufs; 1868 goto e_nobufs;
1876 1869
1870#ifdef CONFIG_IP_ROUTE_CLASSID
1871 rth->dst.tclassid = itag;
1872#endif
1877 rth->dst.output = ip_rt_bug; 1873 rth->dst.output = ip_rt_bug;
1878 rth->dst.obsolete = -1;
1879 1874
1880 atomic_set(&rth->dst.__refcnt, 1); 1875 rth->rt_key_dst = daddr;
1881 rth->dst.flags= DST_HOST; 1876 rth->rt_key_src = saddr;
1882 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 1877 rth->rt_genid = rt_genid(dev_net(dev));
1883 rth->dst.flags |= DST_NOPOLICY; 1878 rth->rt_flags = RTCF_MULTICAST;
1884 rth->fl.fl4_dst = daddr; 1879 rth->rt_type = RTN_MULTICAST;
1880 rth->rt_key_tos = tos;
1885 rth->rt_dst = daddr; 1881 rth->rt_dst = daddr;
1886 rth->fl.fl4_tos = tos;
1887 rth->fl.mark = skb->mark;
1888 rth->fl.fl4_src = saddr;
1889 rth->rt_src = saddr; 1882 rth->rt_src = saddr;
1890#ifdef CONFIG_NET_CLS_ROUTE 1883 rth->rt_route_iif = dev->ifindex;
1891 rth->dst.tclassid = itag; 1884 rth->rt_iif = dev->ifindex;
1892#endif 1885 rth->rt_oif = 0;
1893 rth->rt_iif = 1886 rth->rt_mark = skb->mark;
1894 rth->fl.iif = dev->ifindex;
1895 rth->dst.dev = init_net.loopback_dev;
1896 dev_hold(rth->dst.dev);
1897 rth->idev = in_dev_get(rth->dst.dev);
1898 rth->fl.oif = 0;
1899 rth->rt_gateway = daddr; 1887 rth->rt_gateway = daddr;
1900 rth->rt_spec_dst= spec_dst; 1888 rth->rt_spec_dst= spec_dst;
1901 rth->rt_genid = rt_genid(dev_net(dev)); 1889 rth->rt_peer_genid = 0;
1902 rth->rt_flags = RTCF_MULTICAST; 1890 rth->peer = NULL;
1903 rth->rt_type = RTN_MULTICAST; 1891 rth->fi = NULL;
1904 if (our) { 1892 if (our) {
1905 rth->dst.input= ip_local_deliver; 1893 rth->dst.input= ip_local_deliver;
1906 rth->rt_flags |= RTCF_LOCAL; 1894 rth->rt_flags |= RTCF_LOCAL;
@@ -1913,7 +1901,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1913 RT_CACHE_STAT_INC(in_slow_mc); 1901 RT_CACHE_STAT_INC(in_slow_mc);
1914 1902
1915 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1904 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1905 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1917 1906
1918e_nobufs: 1907e_nobufs:
1919 return -ENOBUFS; 1908 return -ENOBUFS;
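
The conversion visible here runs through the whole input path: rt_intern_hash() no longer takes a struct rtable ** out-parameter and returns an int, it hands back the cache entry directly, with failures encoded as an error pointer and unpacked via IS_ERR()/PTR_ERR(). A userspace sketch of that pointer-encoded-error convention; the helpers below imitate the kernel's err.h macros rather than reuse them.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Imitations of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers:
 * negative errno values live in the last page of the address space,
 * which no valid allocation can occupy. */
#define MAX_ERRNO 4095

static void *err_ptr(long error)      { return (void *)error; }
static int   is_err(const void *ptr)  { return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO; }
static long  ptr_err(const void *ptr) { return (long)(intptr_t)ptr; }

struct rtable_stub { unsigned int daddr; };

/* Returns a route or an encoded -ENOBUFS, never NULL-plus-errno. */
static struct rtable_stub *make_route(unsigned int daddr, int simulate_oom)
{
    struct rtable_stub *rt;

    if (simulate_oom)
        return err_ptr(-ENOBUFS);
    rt = malloc(sizeof(*rt));
    if (!rt)
        return err_ptr(-ENOBUFS);
    rt->daddr = daddr;
    return rt;
}

int main(void)
{
    struct rtable_stub *rt = make_route(0x0a000001, 0);
    struct rtable_stub *bad = make_route(0x0a000002, 1);

    if (!is_err(rt))
        printf("route ok, daddr=%#x\n", rt->daddr);
    if (is_err(bad))
        printf("route failed: err=%ld\n", ptr_err(bad));
    free(rt);
    return 0;
}
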
@@ -1956,7 +1945,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1956 1945
1957/* called in rcu_read_lock() section */ 1946/* called in rcu_read_lock() section */
1958static int __mkroute_input(struct sk_buff *skb, 1947static int __mkroute_input(struct sk_buff *skb,
1959 struct fib_result *res, 1948 const struct fib_result *res,
1960 struct in_device *in_dev, 1949 struct in_device *in_dev,
1961 __be32 daddr, __be32 saddr, u32 tos, 1950 __be32 daddr, __be32 saddr, u32 tos,
1962 struct rtable **result) 1951 struct rtable **result)
@@ -1978,8 +1967,8 @@ static int __mkroute_input(struct sk_buff *skb,
1978 } 1967 }
1979 1968
1980 1969
1981 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 1970 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1982 in_dev->dev, &spec_dst, &itag, skb->mark); 1971 in_dev->dev, &spec_dst, &itag);
1983 if (err < 0) { 1972 if (err < 0) {
1984 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1973 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985 saddr); 1974 saddr);
@@ -2010,42 +1999,36 @@ static int __mkroute_input(struct sk_buff *skb,
2010 } 1999 }
2011 } 2000 }
2012 2001
2013 2002 rth = rt_dst_alloc(out_dev->dev,
2014 rth = dst_alloc(&ipv4_dst_ops); 2003 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2004 IN_DEV_CONF_GET(out_dev, NOXFRM));
2015 if (!rth) { 2005 if (!rth) {
2016 err = -ENOBUFS; 2006 err = -ENOBUFS;
2017 goto cleanup; 2007 goto cleanup;
2018 } 2008 }
2019 2009
2020 atomic_set(&rth->dst.__refcnt, 1); 2010 rth->rt_key_dst = daddr;
2021 rth->dst.flags= DST_HOST; 2011 rth->rt_key_src = saddr;
2022 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2012 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2023 rth->dst.flags |= DST_NOPOLICY; 2013 rth->rt_flags = flags;
2024 if (IN_DEV_CONF_GET(out_dev, NOXFRM)) 2014 rth->rt_type = res->type;
2025 rth->dst.flags |= DST_NOXFRM; 2015 rth->rt_key_tos = tos;
2026 rth->fl.fl4_dst = daddr;
2027 rth->rt_dst = daddr; 2016 rth->rt_dst = daddr;
2028 rth->fl.fl4_tos = tos;
2029 rth->fl.mark = skb->mark;
2030 rth->fl.fl4_src = saddr;
2031 rth->rt_src = saddr; 2017 rth->rt_src = saddr;
2018 rth->rt_route_iif = in_dev->dev->ifindex;
2019 rth->rt_iif = in_dev->dev->ifindex;
2020 rth->rt_oif = 0;
2021 rth->rt_mark = skb->mark;
2032 rth->rt_gateway = daddr; 2022 rth->rt_gateway = daddr;
2033 rth->rt_iif =
2034 rth->fl.iif = in_dev->dev->ifindex;
2035 rth->dst.dev = (out_dev)->dev;
2036 dev_hold(rth->dst.dev);
2037 rth->idev = in_dev_get(rth->dst.dev);
2038 rth->fl.oif = 0;
2039 rth->rt_spec_dst= spec_dst; 2023 rth->rt_spec_dst= spec_dst;
2024 rth->rt_peer_genid = 0;
2025 rth->peer = NULL;
2026 rth->fi = NULL;
2040 2027
2041 rth->dst.obsolete = -1;
2042 rth->dst.input = ip_forward; 2028 rth->dst.input = ip_forward;
2043 rth->dst.output = ip_output; 2029 rth->dst.output = ip_output;
2044 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045
2046 rt_set_nexthop(rth, res, itag);
2047 2030
2048 rth->rt_flags = flags; 2031 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2032
2050 *result = rth; 2033 *result = rth;
2051 err = 0; 2034 err = 0;
@@ -2055,7 +2038,7 @@ static int __mkroute_input(struct sk_buff *skb,
2055 2038
2056static int ip_mkroute_input(struct sk_buff *skb, 2039static int ip_mkroute_input(struct sk_buff *skb,
2057 struct fib_result *res, 2040 struct fib_result *res,
2058 const struct flowi *fl, 2041 const struct flowi4 *fl4,
2059 struct in_device *in_dev, 2042 struct in_device *in_dev,
2060 __be32 daddr, __be32 saddr, u32 tos) 2043 __be32 daddr, __be32 saddr, u32 tos)
2061{ 2044{
@@ -2064,8 +2047,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2064 unsigned hash; 2047 unsigned hash;
2065 2048
2066#ifdef CONFIG_IP_ROUTE_MULTIPATH 2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
2067 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2050 if (res->fi && res->fi->fib_nhs > 1)
2068 fib_select_multipath(fl, res); 2051 fib_select_multipath(res);
2069#endif 2052#endif
2070 2053
2071 /* create a routing cache entry */ 2054 /* create a routing cache entry */
@@ -2074,9 +2057,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2074 return err; 2057 return err;
2075 2058
2076 /* put it into the cache */ 2059 /* put it into the cache */
2077 hash = rt_hash(daddr, saddr, fl->iif, 2060 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2078 rt_genid(dev_net(rth->dst.dev))); 2061 rt_genid(dev_net(rth->dst.dev)));
2079 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2062 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2063 if (IS_ERR(rth))
2064 return PTR_ERR(rth);
2065 return 0;
2080} 2066}
2081 2067
2082/* 2068/*
@@ -2087,6 +2073,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2087 * Such approach solves two big problems: 2073 * Such approach solves two big problems:
2088 * 1. Not simplex devices are handled properly. 2074 * 1. Not simplex devices are handled properly.
2089 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2075 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2076 * called with rcu_read_lock()
2090 */ 2077 */
2091 2078
2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2079static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2094,21 +2081,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094{ 2081{
2095 struct fib_result res; 2082 struct fib_result res;
2096 struct in_device *in_dev = __in_dev_get_rcu(dev); 2083 struct in_device *in_dev = __in_dev_get_rcu(dev);
2097 struct flowi fl = { .nl_u = { .ip4_u = 2084 struct flowi4 fl4;
2098 { .daddr = daddr,
2099 .saddr = saddr,
2100 .tos = tos,
2101 .scope = RT_SCOPE_UNIVERSE,
2102 } },
2103 .mark = skb->mark,
2104 .iif = dev->ifindex };
2105 unsigned flags = 0; 2085 unsigned flags = 0;
2106 u32 itag = 0; 2086 u32 itag = 0;
2107 struct rtable * rth; 2087 struct rtable * rth;
2108 unsigned hash; 2088 unsigned hash;
2109 __be32 spec_dst; 2089 __be32 spec_dst;
2110 int err = -EINVAL; 2090 int err = -EINVAL;
2111 int free_res = 0;
2112 struct net * net = dev_net(dev); 2091 struct net * net = dev_net(dev);
2113 2092
2114 /* IP on this device is disabled. */ 2093 /* IP on this device is disabled. */
@@ -2124,7 +2103,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 ipv4_is_loopback(saddr)) 2103 ipv4_is_loopback(saddr))
2125 goto martian_source; 2104 goto martian_source;
2126 2105
2127 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2106 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128 goto brd_input; 2107 goto brd_input;
2129 2108
2130 /* Accept zero addresses only to limited broadcast; 2109 /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2112,25 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 if (ipv4_is_zeronet(saddr)) 2112 if (ipv4_is_zeronet(saddr))
2134 goto martian_source; 2113 goto martian_source;
2135 2114
2136 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2115 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137 ipv4_is_loopback(daddr))
2138 goto martian_destination; 2116 goto martian_destination;
2139 2117
2140 /* 2118 /*
2141 * Now we are ready to route packet. 2119 * Now we are ready to route packet.
2142 */ 2120 */
2143 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2121 fl4.flowi4_oif = 0;
2122 fl4.flowi4_iif = dev->ifindex;
2123 fl4.flowi4_mark = skb->mark;
2124 fl4.flowi4_tos = tos;
2125 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2126 fl4.daddr = daddr;
2127 fl4.saddr = saddr;
2128 err = fib_lookup(net, &fl4, &res);
2129 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2130 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2131 goto e_hostunreach;
2146 goto no_route; 2132 goto no_route;
2147 } 2133 }
2148 free_res = 1;
2149 2134
2150 RT_CACHE_STAT_INC(in_slow_tot); 2135 RT_CACHE_STAT_INC(in_slow_tot);
2151 2136
@@ -2153,9 +2138,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2153 goto brd_input; 2138 goto brd_input;
2154 2139
2155 if (res.type == RTN_LOCAL) { 2140 if (res.type == RTN_LOCAL) {
2156 err = fib_validate_source(saddr, daddr, tos, 2141 err = fib_validate_source(skb, saddr, daddr, tos,
2157 net->loopback_dev->ifindex, 2142 net->loopback_dev->ifindex,
2158 dev, &spec_dst, &itag, skb->mark); 2143 dev, &spec_dst, &itag);
2159 if (err < 0) 2144 if (err < 0)
2160 goto martian_source_keep_err; 2145 goto martian_source_keep_err;
2161 if (err) 2146 if (err)
@@ -2169,10 +2154,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2169 if (res.type != RTN_UNICAST) 2154 if (res.type != RTN_UNICAST)
2170 goto martian_destination; 2155 goto martian_destination;
2171 2156
2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2157 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2173done:
2174 if (free_res)
2175 fib_res_put(&res);
2176out: return err; 2158out: return err;
2177 2159
2178brd_input: 2160brd_input:
@@ -2182,8 +2164,8 @@ brd_input:
2182 if (ipv4_is_zeronet(saddr)) 2164 if (ipv4_is_zeronet(saddr))
2183 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2165 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2184 else { 2166 else {
2185 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2167 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2186 &itag, skb->mark); 2168 &itag);
2187 if (err < 0) 2169 if (err < 0)
2188 goto martian_source_keep_err; 2170 goto martian_source_keep_err;
2189 if (err) 2171 if (err)
@@ -2194,45 +2176,48 @@ brd_input:
2194 RT_CACHE_STAT_INC(in_brd); 2176 RT_CACHE_STAT_INC(in_brd);
2195 2177
2196local_input: 2178local_input:
2197 rth = dst_alloc(&ipv4_dst_ops); 2179 rth = rt_dst_alloc(net->loopback_dev,
2180 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2198 if (!rth) 2181 if (!rth)
2199 goto e_nobufs; 2182 goto e_nobufs;
2200 2183
2184 rth->dst.input= ip_local_deliver;
2201 rth->dst.output= ip_rt_bug; 2185 rth->dst.output= ip_rt_bug;
2202 rth->dst.obsolete = -1; 2186#ifdef CONFIG_IP_ROUTE_CLASSID
2203 rth->rt_genid = rt_genid(net); 2187 rth->dst.tclassid = itag;
2188#endif
2204 2189
2205 atomic_set(&rth->dst.__refcnt, 1); 2190 rth->rt_key_dst = daddr;
2206 rth->dst.flags= DST_HOST; 2191 rth->rt_key_src = saddr;
2207 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2192 rth->rt_genid = rt_genid(net);
2208 rth->dst.flags |= DST_NOPOLICY; 2193 rth->rt_flags = flags|RTCF_LOCAL;
2209 rth->fl.fl4_dst = daddr; 2194 rth->rt_type = res.type;
2195 rth->rt_key_tos = tos;
2210 rth->rt_dst = daddr; 2196 rth->rt_dst = daddr;
2211 rth->fl.fl4_tos = tos;
2212 rth->fl.mark = skb->mark;
2213 rth->fl.fl4_src = saddr;
2214 rth->rt_src = saddr; 2197 rth->rt_src = saddr;
2215#ifdef CONFIG_NET_CLS_ROUTE 2198#ifdef CONFIG_IP_ROUTE_CLASSID
2216 rth->dst.tclassid = itag; 2199 rth->dst.tclassid = itag;
2217#endif 2200#endif
2218 rth->rt_iif = 2201 rth->rt_route_iif = dev->ifindex;
2219 rth->fl.iif = dev->ifindex; 2202 rth->rt_iif = dev->ifindex;
2220 rth->dst.dev = net->loopback_dev; 2203 rth->rt_oif = 0;
2221 dev_hold(rth->dst.dev); 2204 rth->rt_mark = skb->mark;
2222 rth->idev = in_dev_get(rth->dst.dev);
2223 rth->rt_gateway = daddr; 2205 rth->rt_gateway = daddr;
2224 rth->rt_spec_dst= spec_dst; 2206 rth->rt_spec_dst= spec_dst;
2225 rth->dst.input= ip_local_deliver; 2207 rth->rt_peer_genid = 0;
2226 rth->rt_flags = flags|RTCF_LOCAL; 2208 rth->peer = NULL;
2209 rth->fi = NULL;
2227 if (res.type == RTN_UNREACHABLE) { 2210 if (res.type == RTN_UNREACHABLE) {
2228 rth->dst.input= ip_error; 2211 rth->dst.input= ip_error;
2229 rth->dst.error= -err; 2212 rth->dst.error= -err;
2230 rth->rt_flags &= ~RTCF_LOCAL; 2213 rth->rt_flags &= ~RTCF_LOCAL;
2231 } 2214 }
2232 rth->rt_type = res.type; 2215 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2216 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2217 err = 0;
2235 goto done; 2218 if (IS_ERR(rth))
2219 err = PTR_ERR(rth);
2220 goto out;
2236 2221
2237no_route: 2222no_route:
2238 RT_CACHE_STAT_INC(in_no_route); 2223 RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2240,21 @@ martian_destination:
2255 2240
2256e_hostunreach: 2241e_hostunreach:
2257 err = -EHOSTUNREACH; 2242 err = -EHOSTUNREACH;
2258 goto done; 2243 goto out;
2259 2244
2260e_inval: 2245e_inval:
2261 err = -EINVAL; 2246 err = -EINVAL;
2262 goto done; 2247 goto out;
2263 2248
2264e_nobufs: 2249e_nobufs:
2265 err = -ENOBUFS; 2250 err = -ENOBUFS;
2266 goto done; 2251 goto out;
2267 2252
2268martian_source: 2253martian_source:
2269 err = -EINVAL; 2254 err = -EINVAL;
2270martian_source_keep_err: 2255martian_source_keep_err:
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2256 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 goto done; 2257 goto out;
2273} 2258}
2274 2259
2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2260int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2293,12 +2278,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 2278
2294 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2279 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2295 rth = rcu_dereference(rth->dst.rt_next)) { 2280 rth = rcu_dereference(rth->dst.rt_next)) {
2296 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2281 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2297 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2282 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2298 (rth->fl.iif ^ iif) | 2283 (rth->rt_iif ^ iif) |
2299 rth->fl.oif | 2284 rth->rt_oif |
2300 (rth->fl.fl4_tos ^ tos)) == 0 && 2285 (rth->rt_key_tos ^ tos)) == 0 &&
2301 rth->fl.mark == skb->mark && 2286 rth->rt_mark == skb->mark &&
2302 net_eq(dev_net(rth->dst.dev), net) && 2287 net_eq(dev_net(rth->dst.dev), net) &&
2303 !rt_is_expired(rth)) { 2288 !rt_is_expired(rth)) {
2304 if (noref) { 2289 if (noref) {
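
The cache probe above compares its whole key in one expression: each cached field is XORed with the candidate, the results are ORed together, and the sum is tested against zero, so a mismatch in any field is detected without a chain of conditional branches (rt_oif is ORed in directly, since an input route must have it zero). A sketch with a simplified key; the structure below only echoes the field names, it is not the kernel's rtable.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flow_key {
    uint32_t dst;    /* rt_key_dst  */
    uint32_t src;    /* rt_key_src  */
    int      iif;    /* rt_iif      */
    int      oif;    /* rt_oif, must be 0 for an input route */
    uint32_t tos;    /* rt_key_tos  */
    uint32_t mark;   /* rt_mark     */
};

/* Match an input-route cache entry: any differing field makes one of
 * the XOR terms non-zero, so the OR of all terms is non-zero too. */
static bool input_route_match(const struct flow_key *cached,
                              uint32_t daddr, uint32_t saddr,
                              int iif, uint32_t tos, uint32_t mark)
{
    return (((cached->dst ^ daddr) |
             (cached->src ^ saddr) |
             (uint32_t)(cached->iif ^ iif) |
             (uint32_t)cached->oif |        /* output routes never match */
             (cached->tos ^ tos)) == 0) &&
           cached->mark == mark;
}

int main(void)
{
    struct flow_key k = { 0x0a000001, 0x0a000002, 3, 0, 0x10, 7 };

    printf("%d\n", input_route_match(&k, 0x0a000001, 0x0a000002, 3, 0x10, 7));
    printf("%d\n", input_route_match(&k, 0x0a000001, 0x0a000002, 4, 0x10, 7));
    return 0;
}
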
@@ -2331,8 +2316,8 @@ skip_cache:
2331 struct in_device *in_dev = __in_dev_get_rcu(dev); 2316 struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 2317
2333 if (in_dev) { 2318 if (in_dev) {
2334 int our = ip_check_mc(in_dev, daddr, saddr, 2319 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2335 ip_hdr(skb)->protocol); 2320 ip_hdr(skb)->protocol);
2336 if (our 2321 if (our
2337#ifdef CONFIG_IP_MROUTE 2322#ifdef CONFIG_IP_MROUTE
2338 || 2323 ||
@@ -2355,108 +2340,95 @@ skip_cache:
2355} 2340}
2356EXPORT_SYMBOL(ip_route_input_common); 2341EXPORT_SYMBOL(ip_route_input_common);
2357 2342
2358static int __mkroute_output(struct rtable **result, 2343/* called with rcu_read_lock() */
2359 struct fib_result *res, 2344static struct rtable *__mkroute_output(const struct fib_result *res,
2360 const struct flowi *fl, 2345 const struct flowi4 *fl4,
2361 const struct flowi *oldflp, 2346 __be32 orig_daddr, __be32 orig_saddr,
2362 struct net_device *dev_out, 2347 int orig_oif, struct net_device *dev_out,
2363 unsigned flags) 2348 unsigned int flags)
2364{ 2349{
2365 struct rtable *rth; 2350 struct fib_info *fi = res->fi;
2351 u32 tos = RT_FL_TOS(fl4);
2366 struct in_device *in_dev; 2352 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2353 u16 type = res->type;
2368 int err = 0; 2354 struct rtable *rth;
2369 2355
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2356 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2357 return ERR_PTR(-EINVAL);
2372 2358
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2359 if (ipv4_is_lbcast(fl4->daddr))
2374 res->type = RTN_BROADCAST; 2360 type = RTN_BROADCAST;
2375 else if (ipv4_is_multicast(fl->fl4_dst)) 2361 else if (ipv4_is_multicast(fl4->daddr))
2376 res->type = RTN_MULTICAST; 2362 type = RTN_MULTICAST;
2377 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2363 else if (ipv4_is_zeronet(fl4->daddr))
2378 return -EINVAL; 2364 return ERR_PTR(-EINVAL);
2379 2365
2380 if (dev_out->flags & IFF_LOOPBACK) 2366 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2367 flags |= RTCF_LOCAL;
2382 2368
2383 /* get work reference to inet device */ 2369 in_dev = __in_dev_get_rcu(dev_out);
2384 in_dev = in_dev_get(dev_out);
2385 if (!in_dev) 2370 if (!in_dev)
2386 return -EINVAL; 2371 return ERR_PTR(-EINVAL);
2387 2372
2388 if (res->type == RTN_BROADCAST) { 2373 if (type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2374 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2375 fi = NULL;
2391 fib_info_put(res->fi); 2376 } else if (type == RTN_MULTICAST) {
2392 res->fi = NULL; 2377 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2393 } 2378 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2394 } else if (res->type == RTN_MULTICAST) { 2379 fl4->flowi4_proto))
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2380 flags &= ~RTCF_LOCAL;
2399 /* If multicast route do not exist use 2381 /* If multicast route do not exist use
2400 default one, but do not gateway in this case. 2382 * default one, but do not gateway in this case.
2401 Yes, it is hack. 2383 * Yes, it is hack.
2402 */ 2384 */
2403 if (res->fi && res->prefixlen < 4) { 2385 if (fi && res->prefixlen < 4)
2404 fib_info_put(res->fi); 2386 fi = NULL;
2405 res->fi = NULL;
2406 }
2407 } 2387 }
2408 2388
2389 rth = rt_dst_alloc(dev_out,
2390 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2391 IN_DEV_CONF_GET(in_dev, NOXFRM));
2392 if (!rth)
2393 return ERR_PTR(-ENOBUFS);
2409 2394
2410 rth = dst_alloc(&ipv4_dst_ops); 2395 rth->dst.output = ip_output;
2411 if (!rth) {
2412 err = -ENOBUFS;
2413 goto cleanup;
2414 }
2415 2396
2416 atomic_set(&rth->dst.__refcnt, 1); 2397 rth->rt_key_dst = orig_daddr;
2417 rth->dst.flags= DST_HOST; 2398 rth->rt_key_src = orig_saddr;
2418 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2419 rth->dst.flags |= DST_NOXFRM;
2420 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2421 rth->dst.flags |= DST_NOPOLICY;
2422
2423 rth->fl.fl4_dst = oldflp->fl4_dst;
2424 rth->fl.fl4_tos = tos;
2425 rth->fl.fl4_src = oldflp->fl4_src;
2426 rth->fl.oif = oldflp->oif;
2427 rth->fl.mark = oldflp->mark;
2428 rth->rt_dst = fl->fl4_dst;
2429 rth->rt_src = fl->fl4_src;
2430 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2431 /* get references to the devices that are to be hold by the routing
2432 cache entry */
2433 rth->dst.dev = dev_out;
2434 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src;
2438
2439 rth->dst.output=ip_output;
2440 rth->dst.obsolete = -1;
2441 rth->rt_genid = rt_genid(dev_net(dev_out)); 2399 rth->rt_genid = rt_genid(dev_net(dev_out));
2400 rth->rt_flags = flags;
2401 rth->rt_type = type;
2402 rth->rt_key_tos = tos;
2403 rth->rt_dst = fl4->daddr;
2404 rth->rt_src = fl4->saddr;
2405 rth->rt_route_iif = 0;
2406 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2407 rth->rt_oif = orig_oif;
2408 rth->rt_mark = fl4->flowi4_mark;
2409 rth->rt_gateway = fl4->daddr;
2410 rth->rt_spec_dst= fl4->saddr;
2411 rth->rt_peer_genid = 0;
2412 rth->peer = NULL;
2413 rth->fi = NULL;
2442 2414
2443 RT_CACHE_STAT_INC(out_slow_tot); 2415 RT_CACHE_STAT_INC(out_slow_tot);
2444 2416
2445 if (flags & RTCF_LOCAL) { 2417 if (flags & RTCF_LOCAL) {
2446 rth->dst.input = ip_local_deliver; 2418 rth->dst.input = ip_local_deliver;
2447 rth->rt_spec_dst = fl->fl4_dst; 2419 rth->rt_spec_dst = fl4->daddr;
2448 } 2420 }
2449 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2421 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2450 rth->rt_spec_dst = fl->fl4_src; 2422 rth->rt_spec_dst = fl4->saddr;
2451 if (flags & RTCF_LOCAL && 2423 if (flags & RTCF_LOCAL &&
2452 !(dev_out->flags & IFF_LOOPBACK)) { 2424 !(dev_out->flags & IFF_LOOPBACK)) {
2453 rth->dst.output = ip_mc_output; 2425 rth->dst.output = ip_mc_output;
2454 RT_CACHE_STAT_INC(out_slow_mc); 2426 RT_CACHE_STAT_INC(out_slow_mc);
2455 } 2427 }
2456#ifdef CONFIG_IP_MROUTE 2428#ifdef CONFIG_IP_MROUTE
2457 if (res->type == RTN_MULTICAST) { 2429 if (type == RTN_MULTICAST) {
2458 if (IN_DEV_MFORWARD(in_dev) && 2430 if (IN_DEV_MFORWARD(in_dev) &&
2459 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2431 !ipv4_is_local_multicast(fl4->daddr)) {
2460 rth->dst.input = ip_mr_input; 2432 rth->dst.input = ip_mr_input;
2461 rth->dst.output = ip_mc_output; 2433 rth->dst.output = ip_mc_output;
2462 } 2434 }
@@ -2464,73 +2436,47 @@ static int __mkroute_output(struct rtable **result,
2464#endif 2436#endif
2465 } 2437 }
2466 2438
2467 rt_set_nexthop(rth, res, 0); 2439 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2468
2469 rth->rt_flags = flags;
2470
2471 *result = rth;
2472 cleanup:
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477}
2478
2479static int ip_mkroute_output(struct rtable **rp,
2480 struct fib_result *res,
2481 const struct flowi *fl,
2482 const struct flowi *oldflp,
2483 struct net_device *dev_out,
2484 unsigned flags)
2485{
2486 struct rtable *rth = NULL;
2487 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2488 unsigned hash;
2489 if (err == 0) {
2490 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2491 rt_genid(dev_net(dev_out)));
2492 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2493 }
2494 2440
2495 return err; 2441 return rth;
2496} 2442}
2497 2443
2498/* 2444/*
2499 * Major route resolver routine. 2445 * Major route resolver routine.
2446 * called with rcu_read_lock();
2500 */ 2447 */
2501 2448
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2449static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2503 const struct flowi *oldflp) 2450{
2504{
2505 u32 tos = RT_FL_TOS(oldflp);
2506 struct flowi fl = { .nl_u = { .ip4_u =
2507 { .daddr = oldflp->fl4_dst,
2508 .saddr = oldflp->fl4_src,
2509 .tos = tos & IPTOS_RT_MASK,
2510 .scope = ((tos & RTO_ONLINK) ?
2511 RT_SCOPE_LINK :
2512 RT_SCOPE_UNIVERSE),
2513 } },
2514 .mark = oldflp->mark,
2515 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif };
2517 struct fib_result res;
2518 unsigned flags = 0;
2519 struct net_device *dev_out = NULL; 2451 struct net_device *dev_out = NULL;
2520 int free_res = 0; 2452 u32 tos = RT_FL_TOS(fl4);
2521 int err; 2453 unsigned int flags = 0;
2522 2454 struct fib_result res;
2455 struct rtable *rth;
2456 __be32 orig_daddr;
2457 __be32 orig_saddr;
2458 int orig_oif;
2523 2459
2524 res.fi = NULL; 2460 res.fi = NULL;
2525#ifdef CONFIG_IP_MULTIPLE_TABLES 2461#ifdef CONFIG_IP_MULTIPLE_TABLES
2526 res.r = NULL; 2462 res.r = NULL;
2527#endif 2463#endif
2528 2464
2529 if (oldflp->fl4_src) { 2465 orig_daddr = fl4->daddr;
2530 err = -EINVAL; 2466 orig_saddr = fl4->saddr;
2531 if (ipv4_is_multicast(oldflp->fl4_src) || 2467 orig_oif = fl4->flowi4_oif;
2532 ipv4_is_lbcast(oldflp->fl4_src) || 2468
2533 ipv4_is_zeronet(oldflp->fl4_src)) 2469 fl4->flowi4_iif = net->loopback_dev->ifindex;
2470 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474 rcu_read_lock();
2475 if (fl4->saddr) {
2476 rth = ERR_PTR(-EINVAL);
2477 if (ipv4_is_multicast(fl4->saddr) ||
2478 ipv4_is_lbcast(fl4->saddr) ||
2479 ipv4_is_zeronet(fl4->saddr))
2534 goto out; 2480 goto out;
2535 2481
2536 /* I removed check for oif == dev_out->oif here. 2482 /* I removed check for oif == dev_out->oif here.
@@ -2541,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2541 of another iface. --ANK 2487 of another iface. --ANK
2542 */ 2488 */
2543 2489
2544 if (oldflp->oif == 0 && 2490 if (fl4->flowi4_oif == 0 &&
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2491 (ipv4_is_multicast(fl4->daddr) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2492 ipv4_is_lbcast(fl4->daddr))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2493 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2494 dev_out = __ip_dev_find(net, fl4->saddr, false);
2549 if (dev_out == NULL) 2495 if (dev_out == NULL)
2550 goto out; 2496 goto out;
2551 2497
@@ -2564,67 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2564 Luckily, this hack is good workaround. 2510 Luckily, this hack is good workaround.
2565 */ 2511 */
2566 2512
2567 fl.oif = dev_out->ifindex; 2513 fl4->flowi4_oif = dev_out->ifindex;
2568 goto make_route; 2514 goto make_route;
2569 } 2515 }
2570 2516
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2517 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2518 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2519 if (!__ip_dev_find(net, fl4->saddr, false))
2574 if (dev_out == NULL)
2575 goto out; 2520 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2521 }
2579 } 2522 }
2580 2523
2581 2524
2582 if (oldflp->oif) { 2525 if (fl4->flowi4_oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2526 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584 err = -ENODEV; 2527 rth = ERR_PTR(-ENODEV);
2585 if (dev_out == NULL) 2528 if (dev_out == NULL)
2586 goto out; 2529 goto out;
2587 2530
2588 /* RACE: Check return value of inet_select_addr instead. */ 2531 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2532 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590 dev_put(dev_out); 2533 rth = ERR_PTR(-ENETUNREACH);
2591 goto out; /* Wrong error code */ 2534 goto out;
2592 } 2535 }
2593 2536 if (ipv4_is_local_multicast(fl4->daddr) ||
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2537 ipv4_is_lbcast(fl4->daddr)) {
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2538 if (!fl4->saddr)
2596 if (!fl.fl4_src) 2539 fl4->saddr = inet_select_addr(dev_out, 0,
2597 fl.fl4_src = inet_select_addr(dev_out, 0,
2598 RT_SCOPE_LINK); 2540 RT_SCOPE_LINK);
2599 goto make_route; 2541 goto make_route;
2600 } 2542 }
2601 if (!fl.fl4_src) { 2543 if (fl4->saddr) {
2602 if (ipv4_is_multicast(oldflp->fl4_dst)) 2544 if (ipv4_is_multicast(fl4->daddr))
2603 fl.fl4_src = inet_select_addr(dev_out, 0, 2545 fl4->saddr = inet_select_addr(dev_out, 0,
2604 fl.fl4_scope); 2546 fl4->flowi4_scope);
2605 else if (!oldflp->fl4_dst) 2547 else if (!fl4->daddr)
2606 fl.fl4_src = inet_select_addr(dev_out, 0, 2548 fl4->saddr = inet_select_addr(dev_out, 0,
2607 RT_SCOPE_HOST); 2549 RT_SCOPE_HOST);
2608 } 2550 }
2609 } 2551 }
2610 2552
2611 if (!fl.fl4_dst) { 2553 if (!fl4->daddr) {
2612 fl.fl4_dst = fl.fl4_src; 2554 fl4->daddr = fl4->saddr;
2613 if (!fl.fl4_dst) 2555 if (!fl4->daddr)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2556 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2557 dev_out = net->loopback_dev;
2618 dev_hold(dev_out); 2558 fl4->flowi4_oif = net->loopback_dev->ifindex;
2619 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2559 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2560 flags |= RTCF_LOCAL;
2622 goto make_route; 2561 goto make_route;
2623 } 2562 }
2624 2563
2625 if (fib_lookup(net, &fl, &res)) { 2564 if (fib_lookup(net, fl4, &res)) {
2626 res.fi = NULL; 2565 res.fi = NULL;
2627 if (oldflp->oif) { 2566 if (fl4->flowi4_oif) {
2628 /* Apparently, routing tables are wrong. Assume, 2567 /* Apparently, routing tables are wrong. Assume,
2629 that the destination is on link. 2568 that the destination is on link.
2630 2569
@@ -2643,98 +2582,100 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2643 likely IPv6, but we do not. 2582 likely IPv6, but we do not.
2644 */ 2583 */
2645 2584
2646 if (fl.fl4_src == 0) 2585 if (fl4->saddr == 0)
2647 fl.fl4_src = inet_select_addr(dev_out, 0, 2586 fl4->saddr = inet_select_addr(dev_out, 0,
2648 RT_SCOPE_LINK); 2587 RT_SCOPE_LINK);
2649 res.type = RTN_UNICAST; 2588 res.type = RTN_UNICAST;
2650 goto make_route; 2589 goto make_route;
2651 } 2590 }
2652 if (dev_out) 2591 rth = ERR_PTR(-ENETUNREACH);
2653 dev_put(dev_out);
2654 err = -ENETUNREACH;
2655 goto out; 2592 goto out;
2656 } 2593 }
2657 free_res = 1;
2658 2594
2659 if (res.type == RTN_LOCAL) { 2595 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2596 if (!fl4->saddr) {
2661 fl.fl4_src = fl.fl4_dst; 2597 if (res.fi->fib_prefsrc)
2662 if (dev_out) 2598 fl4->saddr = res.fi->fib_prefsrc;
2663 dev_put(dev_out); 2599 else
2600 fl4->saddr = fl4->daddr;
2601 }
2664 dev_out = net->loopback_dev; 2602 dev_out = net->loopback_dev;
2665 dev_hold(dev_out); 2603 fl4->flowi4_oif = dev_out->ifindex;
2666 fl.oif = dev_out->ifindex;
2667 if (res.fi)
2668 fib_info_put(res.fi);
2669 res.fi = NULL; 2604 res.fi = NULL;
2670 flags |= RTCF_LOCAL; 2605 flags |= RTCF_LOCAL;
2671 goto make_route; 2606 goto make_route;
2672 } 2607 }
2673 2608
2674#ifdef CONFIG_IP_ROUTE_MULTIPATH 2609#ifdef CONFIG_IP_ROUTE_MULTIPATH
2675 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2610 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2676 fib_select_multipath(&fl, &res); 2611 fib_select_multipath(&res);
2677 else 2612 else
2678#endif 2613#endif
2679 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2614 if (!res.prefixlen &&
2680 fib_select_default(net, &fl, &res); 2615 res.table->tb_num_default > 1 &&
2616 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2617 fib_select_default(&res);
2681 2618
2682 if (!fl.fl4_src) 2619 if (!fl4->saddr)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2620 fl4->saddr = FIB_RES_PREFSRC(net, res);
2684 2621
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2622 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out); 2623 fl4->flowi4_oif = dev_out->ifindex;
2689 fl.oif = dev_out->ifindex;
2690 2624
2691 2625
2692make_route: 2626make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2627 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628 dev_out, flags);
2629 if (!IS_ERR(rth)) {
2630 unsigned int hash;
2694 2631
2632 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2633 rt_genid(dev_net(dev_out)));
2634 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2635 }
2695 2636
2696 if (free_res) 2637out:
2697 fib_res_put(&res); 2638 rcu_read_unlock();
2698 if (dev_out) 2639 return rth;
2699 dev_put(dev_out);
2700out: return err;
2701} 2640}
2702 2641
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2642struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2704 const struct flowi *flp)
2705{ 2643{
2706 unsigned hash;
2707 struct rtable *rth; 2644 struct rtable *rth;
2645 unsigned int hash;
2708 2646
2709 if (!rt_caching(net)) 2647 if (!rt_caching(net))
2710 goto slow_output; 2648 goto slow_output;
2711 2649
2712 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2650 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2713 2651
2714 rcu_read_lock_bh(); 2652 rcu_read_lock_bh();
2715 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2653 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2716 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2654 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2717 if (rth->fl.fl4_dst == flp->fl4_dst && 2655 if (rth->rt_key_dst == flp4->daddr &&
2718 rth->fl.fl4_src == flp->fl4_src && 2656 rth->rt_key_src == flp4->saddr &&
2719 rth->fl.iif == 0 && 2657 rt_is_output_route(rth) &&
2720 rth->fl.oif == flp->oif && 2658 rth->rt_oif == flp4->flowi4_oif &&
2721 rth->fl.mark == flp->mark && 2659 rth->rt_mark == flp4->flowi4_mark &&
2722 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2660 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2723 (IPTOS_RT_MASK | RTO_ONLINK)) && 2661 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2724 net_eq(dev_net(rth->dst.dev), net) && 2662 net_eq(dev_net(rth->dst.dev), net) &&
2725 !rt_is_expired(rth)) { 2663 !rt_is_expired(rth)) {
2726 dst_use(&rth->dst, jiffies); 2664 dst_use(&rth->dst, jiffies);
2727 RT_CACHE_STAT_INC(out_hit); 2665 RT_CACHE_STAT_INC(out_hit);
2728 rcu_read_unlock_bh(); 2666 rcu_read_unlock_bh();
2729 *rp = rth; 2667 if (!flp4->saddr)
2730 return 0; 2668 flp4->saddr = rth->rt_src;
2669 if (!flp4->daddr)
2670 flp4->daddr = rth->rt_dst;
2671 return rth;
2731 } 2672 }
2732 RT_CACHE_STAT_INC(out_hlist_search); 2673 RT_CACHE_STAT_INC(out_hlist_search);
2733 } 2674 }
2734 rcu_read_unlock_bh(); 2675 rcu_read_unlock_bh();
2735 2676
2736slow_output: 2677slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2678 return ip_route_output_slow(net, flp4);
2738} 2679}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2680EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2681
@@ -2743,95 +2684,96 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
2743 return NULL; 2684 return NULL;
2744} 2685}
2745 2686
2687static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688{
2689 return 0;
2690}
2691
2746static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2692static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2747{ 2693{
2748} 2694}
2749 2695
2696static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697 unsigned long old)
2698{
2699 return NULL;
2700}
2701
2750static struct dst_ops ipv4_dst_blackhole_ops = { 2702static struct dst_ops ipv4_dst_blackhole_ops = {
2751 .family = AF_INET, 2703 .family = AF_INET,
2752 .protocol = cpu_to_be16(ETH_P_IP), 2704 .protocol = cpu_to_be16(ETH_P_IP),
2753 .destroy = ipv4_dst_destroy, 2705 .destroy = ipv4_dst_destroy,
2754 .check = ipv4_blackhole_dst_check, 2706 .check = ipv4_blackhole_dst_check,
2707 .default_mtu = ipv4_blackhole_default_mtu,
2708 .default_advmss = ipv4_default_advmss,
2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2709 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2756 .entries = ATOMIC_INIT(0), 2710 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2757}; 2711};
2758 2712
2759 2713struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2760static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2761{ 2714{
2762 struct rtable *ort = *rp; 2715 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2763 struct rtable *rt = (struct rtable *) 2716 struct rtable *ort = (struct rtable *) dst_orig;
2764 dst_alloc(&ipv4_dst_blackhole_ops);
2765 2717
2766 if (rt) { 2718 if (rt) {
2767 struct dst_entry *new = &rt->dst; 2719 struct dst_entry *new = &rt->dst;
2768 2720
2769 atomic_set(&new->__refcnt, 1);
2770 new->__use = 1; 2721 new->__use = 1;
2771 new->input = dst_discard; 2722 new->input = dst_discard;
2772 new->output = dst_discard; 2723 new->output = dst_discard;
2773 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); 2724 dst_copy_metrics(new, &ort->dst);
2774 2725
2775 new->dev = ort->dst.dev; 2726 new->dev = ort->dst.dev;
2776 if (new->dev) 2727 if (new->dev)
2777 dev_hold(new->dev); 2728 dev_hold(new->dev);
2778 2729
2779 rt->fl = ort->fl; 2730 rt->rt_key_dst = ort->rt_key_dst;
2731 rt->rt_key_src = ort->rt_key_src;
2732 rt->rt_key_tos = ort->rt_key_tos;
2733 rt->rt_route_iif = ort->rt_route_iif;
2734 rt->rt_iif = ort->rt_iif;
2735 rt->rt_oif = ort->rt_oif;
2736 rt->rt_mark = ort->rt_mark;
2780 2737
2781 rt->idev = ort->idev;
2782 if (rt->idev)
2783 in_dev_hold(rt->idev);
2784 rt->rt_genid = rt_genid(net); 2738 rt->rt_genid = rt_genid(net);
2785 rt->rt_flags = ort->rt_flags; 2739 rt->rt_flags = ort->rt_flags;
2786 rt->rt_type = ort->rt_type; 2740 rt->rt_type = ort->rt_type;
2787 rt->rt_dst = ort->rt_dst; 2741 rt->rt_dst = ort->rt_dst;
2788 rt->rt_src = ort->rt_src; 2742 rt->rt_src = ort->rt_src;
2789 rt->rt_iif = ort->rt_iif;
2790 rt->rt_gateway = ort->rt_gateway; 2743 rt->rt_gateway = ort->rt_gateway;
2791 rt->rt_spec_dst = ort->rt_spec_dst; 2744 rt->rt_spec_dst = ort->rt_spec_dst;
2792 rt->peer = ort->peer; 2745 rt->peer = ort->peer;
2793 if (rt->peer) 2746 if (rt->peer)
2794 atomic_inc(&rt->peer->refcnt); 2747 atomic_inc(&rt->peer->refcnt);
2748 rt->fi = ort->fi;
2749 if (rt->fi)
2750 atomic_inc(&rt->fi->fib_clntref);
2795 2751
2796 dst_free(new); 2752 dst_free(new);
2797 } 2753 }
2798 2754
2799 dst_release(&(*rp)->dst); 2755 dst_release(dst_orig);
2800 *rp = rt; 2756
2801 return (rt ? 0 : -ENOMEM); 2757 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2802} 2758}
2803 2759
2804int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2760struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2805 struct sock *sk, int flags) 2761 struct sock *sk)
2806{ 2762{
2807 int err; 2763 struct rtable *rt = __ip_route_output_key(net, flp4);
2808
2809 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2810 return err;
2811 2764
2812 if (flp->proto) { 2765 if (IS_ERR(rt))
2813 if (!flp->fl4_src) 2766 return rt;
2814 flp->fl4_src = (*rp)->rt_src;
2815 if (!flp->fl4_dst)
2816 flp->fl4_dst = (*rp)->rt_dst;
2817 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2818 flags ? XFRM_LOOKUP_WAIT : 0);
2819 if (err == -EREMOTE)
2820 err = ipv4_dst_blackhole(net, rp, flp);
2821 2767
2822 return err; 2768 if (flp4->flowi4_proto)
2823 } 2769 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770 flowi4_to_flowi(flp4),
2771 sk, 0);
2824 2772
2825 return 0; 2773 return rt;
2826} 2774}
2827EXPORT_SYMBOL_GPL(ip_route_output_flow); 2775EXPORT_SYMBOL_GPL(ip_route_output_flow);
2828 2776
2829int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2830{
2831 return ip_route_output_flow(net, rp, flp, NULL, 0);
2832}
2833EXPORT_SYMBOL(ip_route_output_key);
2834
2835static int rt_fill_info(struct net *net, 2777static int rt_fill_info(struct net *net,
2836 struct sk_buff *skb, u32 pid, u32 seq, int event, 2778 struct sk_buff *skb, u32 pid, u32 seq, int event,
2837 int nowait, unsigned int flags) 2779 int nowait, unsigned int flags)
@@ -2839,7 +2781,8 @@ static int rt_fill_info(struct net *net,
2839 struct rtable *rt = skb_rtable(skb); 2781 struct rtable *rt = skb_rtable(skb);
2840 struct rtmsg *r; 2782 struct rtmsg *r;
2841 struct nlmsghdr *nlh; 2783 struct nlmsghdr *nlh;
2842 long expires; 2784 long expires = 0;
2785 const struct inet_peer *peer = rt->peer;
2843 u32 id = 0, ts = 0, tsage = 0, error; 2786 u32 id = 0, ts = 0, tsage = 0, error;
2844 2787
2845 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2788 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
@@ -2850,7 +2793,7 @@ static int rt_fill_info(struct net *net,
2850 r->rtm_family = AF_INET; 2793 r->rtm_family = AF_INET;
2851 r->rtm_dst_len = 32; 2794 r->rtm_dst_len = 32;
2852 r->rtm_src_len = 0; 2795 r->rtm_src_len = 0;
2853 r->rtm_tos = rt->fl.fl4_tos; 2796 r->rtm_tos = rt->rt_key_tos;
2854 r->rtm_table = RT_TABLE_MAIN; 2797 r->rtm_table = RT_TABLE_MAIN;
2855 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2798 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2856 r->rtm_type = rt->rt_type; 2799 r->rtm_type = rt->rt_type;
@@ -2862,48 +2805,52 @@ static int rt_fill_info(struct net *net,
2862 2805
2863 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2806 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2864 2807
2865 if (rt->fl.fl4_src) { 2808 if (rt->rt_key_src) {
2866 r->rtm_src_len = 32; 2809 r->rtm_src_len = 32;
2867 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2810 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2868 } 2811 }
2869 if (rt->dst.dev) 2812 if (rt->dst.dev)
2870 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2813 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2871#ifdef CONFIG_NET_CLS_ROUTE 2814#ifdef CONFIG_IP_ROUTE_CLASSID
2872 if (rt->dst.tclassid) 2815 if (rt->dst.tclassid)
2873 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2816 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2874#endif 2817#endif
2875 if (rt->fl.iif) 2818 if (rt_is_input_route(rt))
2876 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2819 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2877 else if (rt->rt_src != rt->fl.fl4_src) 2820 else if (rt->rt_src != rt->rt_key_src)
2878 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2821 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2879 2822
2880 if (rt->rt_dst != rt->rt_gateway) 2823 if (rt->rt_dst != rt->rt_gateway)
2881 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2824 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2882 2825
2883 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) 2826 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2884 goto nla_put_failure; 2827 goto nla_put_failure;
2885 2828
2886 if (rt->fl.mark) 2829 if (rt->rt_mark)
2887 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2830 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2888 2831
2889 error = rt->dst.error; 2832 error = rt->dst.error;
2890 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2833 if (peer) {
2891 if (rt->peer) {
2892 inet_peer_refcheck(rt->peer); 2834 inet_peer_refcheck(rt->peer);
2893 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2835 id = atomic_read(&peer->ip_id_count) & 0xffff;
2894 if (rt->peer->tcp_ts_stamp) { 2836 if (peer->tcp_ts_stamp) {
2895 ts = rt->peer->tcp_ts; 2837 ts = peer->tcp_ts;
2896 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2838 tsage = get_seconds() - peer->tcp_ts_stamp;
2897 } 2839 }
2840 expires = ACCESS_ONCE(peer->pmtu_expires);
2841 if (expires)
2842 expires -= jiffies;
2898 } 2843 }
2899 2844
2900 if (rt->fl.iif) { 2845 if (rt_is_input_route(rt)) {
2901#ifdef CONFIG_IP_MROUTE 2846#ifdef CONFIG_IP_MROUTE
2902 __be32 dst = rt->rt_dst; 2847 __be32 dst = rt->rt_dst;
2903 2848
2904 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2849 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2905 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2850 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2906 int err = ipmr_get_route(net, skb, r, nowait); 2851 int err = ipmr_get_route(net, skb,
2852 rt->rt_src, rt->rt_dst,
2853 r, nowait);
2907 if (err <= 0) { 2854 if (err <= 0) {
2908 if (!nowait) { 2855 if (!nowait) {
2909 if (err == 0) 2856 if (err == 0)
@@ -2917,7 +2864,7 @@ static int rt_fill_info(struct net *net,
2917 } 2864 }
2918 } else 2865 } else
2919#endif 2866#endif
2920 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2867 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2921 } 2868 }
2922 2869
2923 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2870 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2991,18 +2938,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2991 if (err == 0 && rt->dst.error) 2938 if (err == 0 && rt->dst.error)
2992 err = -rt->dst.error; 2939 err = -rt->dst.error;
2993 } else { 2940 } else {
2994 struct flowi fl = { 2941 struct flowi4 fl4 = {
2995 .nl_u = { 2942 .daddr = dst,
2996 .ip4_u = { 2943 .saddr = src,
2997 .daddr = dst, 2944 .flowi4_tos = rtm->rtm_tos,
2998 .saddr = src, 2945 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2999 .tos = rtm->rtm_tos, 2946 .flowi4_mark = mark,
3000 },
3001 },
3002 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3003 .mark = mark,
3004 }; 2947 };
3005 err = ip_route_output_key(net, &rt, &fl); 2948 rt = ip_route_output_key(net, &fl4);
2949
2950 err = 0;
2951 if (IS_ERR(rt))
2952 err = PTR_ERR(rt);
3006 } 2953 }
3007 2954
3008 if (err) 2955 if (err)
@@ -3285,6 +3232,8 @@ static __net_init int rt_genid_init(struct net *net)
3285{ 3232{
3286 get_random_bytes(&net->ipv4.rt_genid, 3233 get_random_bytes(&net->ipv4.rt_genid,
3287 sizeof(net->ipv4.rt_genid)); 3234 sizeof(net->ipv4.rt_genid));
3235 get_random_bytes(&net->ipv4.dev_addr_genid,
3236 sizeof(net->ipv4.dev_addr_genid));
3288 return 0; 3237 return 0;
3289} 3238}
3290 3239
@@ -3293,9 +3242,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3293}; 3242};
3294 3243
3295 3244
3296#ifdef CONFIG_NET_CLS_ROUTE 3245#ifdef CONFIG_IP_ROUTE_CLASSID
3297struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3246struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3298#endif /* CONFIG_NET_CLS_ROUTE */ 3247#endif /* CONFIG_IP_ROUTE_CLASSID */
3299 3248
3300static __initdata unsigned long rhash_entries; 3249static __initdata unsigned long rhash_entries;
3301static int __init set_rhash_entries(char *str) 3250static int __init set_rhash_entries(char *str)
@@ -3311,7 +3260,7 @@ int __init ip_rt_init(void)
3311{ 3260{
3312 int rc = 0; 3261 int rc = 0;
3313 3262
3314#ifdef CONFIG_NET_CLS_ROUTE 3263#ifdef CONFIG_IP_ROUTE_CLASSID
3315 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3264 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3316 if (!ip_rt_acct) 3265 if (!ip_rt_acct)
3317 panic("IP: failed to allocate ip_rt_acct\n"); 3266 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3323,6 +3272,12 @@ int __init ip_rt_init(void)
3323 3272
3324 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3273 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3325 3274
3275 if (dst_entries_init(&ipv4_dst_ops) < 0)
3276 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
3326 rt_hash_table = (struct rt_hash_bucket *) 3281 rt_hash_table = (struct rt_hash_bucket *)
3327 alloc_large_system_hash("IP route cache", 3282 alloc_large_system_hash("IP route cache",
3328 sizeof(struct rt_hash_bucket), 3283 sizeof(struct rt_hash_bucket),
@@ -3342,14 +3297,6 @@ int __init ip_rt_init(void)
3342 devinet_init(); 3297 devinet_init();
3343 ip_fib_init(); 3298 ip_fib_init();
3344 3299
3345 /* All the timers, started at system startup tend
3346 to synchronize. Perturb it a bit.
3347 */
3348 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3349 expires_ljiffies = jiffies;
3350 schedule_delayed_work(&expires_work,
3351 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3352
3353 if (ip_rt_proc_init()) 3300 if (ip_rt_proc_init())
3354 printk(KERN_ERR "Unable to create route proc files\n"); 3301 printk(KERN_ERR "Unable to create route proc files\n");
3355#ifdef CONFIG_XFRM 3302#ifdef CONFIG_XFRM
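Note: the route.c hunks above convert the IPv4 output path from the old struct flowi plus struct rtable ** calling convention to struct flowi4 lookups that hand back the route directly, with errors encoded as ERR_PTR() values. A minimal caller-side sketch under that convention (net, dst, src, oif and mark are illustrative placeholders, not taken from the patch):

	struct flowi4 fl4 = {
		.daddr       = dst,	/* destination to route */
		.saddr       = src,	/* may be 0; the lookup fills it in */
		.flowi4_oif  = oif,
		.flowi4_mark = mark,
	};
	struct rtable *rt = ip_route_output_key(net, &fl4);

	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* e.g. -ENETUNREACH; no **rp out-parameter anymore */
	/* ... use rt, then drop the reference with ip_rt_put(rt) ... */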
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace2180d..26461492a847 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
321 * the ACK carries the same options again (see RFC1122 4.2.3.8) 321 * the ACK carries the same options again (see RFC1122 4.2.3.8)
322 */ 322 */
323 if (opt && opt->optlen) { 323 if (opt && opt->optlen) {
324 int opt_size = sizeof(struct ip_options) + opt->optlen; 324 int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
325 325
326 ireq->opt = kmalloc(opt_size, GFP_ATOMIC); 326 ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
327 if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { 327 if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
328 kfree(ireq->opt); 328 kfree(ireq->opt);
329 ireq->opt = NULL; 329 ireq->opt = NULL;
330 } 330 }
@@ -345,20 +345,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
345 * no easy way to do this. 345 * no easy way to do this.
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi4 fl4;
349 .nl_u = { .ip4_u = 349
350 { .daddr = ((opt && opt->srr) ? 350 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
351 opt->faddr : 351 RT_SCOPE_UNIVERSE, IPPROTO_TCP,
352 ireq->rmt_addr), 352 inet_sk_flowi_flags(sk),
353 .saddr = ireq->loc_addr, 353 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
354 .tos = RT_CONN_FLAGS(sk) } }, 354 ireq->loc_addr, th->source, th->dest);
355 .proto = IPPROTO_TCP, 355 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
356 .flags = inet_sk_flowi_flags(sk), 356 rt = ip_route_output_key(sock_net(sk), &fl4);
357 .uli_u = { .ports = 357 if (IS_ERR(rt)) {
358 { .sport = th->dest,
359 .dport = th->source } } };
360 security_req_classify_flow(req, &fl);
361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
362 reqsk_free(req); 358 reqsk_free(req);
363 goto out; 359 goto out;
364 } 360 }
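Note: flowi4_init_output() in the hunk above replaces the hand-rolled designated-initializer block. Read positionally, the call shown packs the flow key as sketched here (only annotations are added, the arguments are the ones from the hunk):

	flowi4_init_output(&fl4,
			   0,				/* output interface (oif) */
			   sk->sk_mark,			/* mark */
			   RT_CONN_FLAGS(sk),		/* tos */
			   RT_SCOPE_UNIVERSE,		/* scope */
			   IPPROTO_TCP,			/* protocol */
			   inet_sk_flowi_flags(sk),	/* flags */
			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,	/* daddr */
			   ireq->loc_addr,		/* saddr */
			   th->source, th->dest);	/* dport, sport of the reply flow */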
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d96c1da4b17c..57d0752e239a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,6 +13,7 @@
13#include <linux/seqlock.h> 13#include <linux/seqlock.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/nsproxy.h>
16#include <net/snmp.h> 17#include <net/snmp.h>
17#include <net/icmp.h> 18#include <net/icmp.h>
18#include <net/ip.h> 19#include <net/ip.h>
@@ -21,11 +22,18 @@
21#include <net/udp.h> 22#include <net/udp.h>
22#include <net/cipso_ipv4.h> 23#include <net/cipso_ipv4.h>
23#include <net/inet_frag.h> 24#include <net/inet_frag.h>
25#include <net/ping.h>
24 26
25static int zero; 27static int zero;
26static int tcp_retr1_max = 255; 28static int tcp_retr1_max = 255;
27static int ip_local_port_range_min[] = { 1, 1 }; 29static int ip_local_port_range_min[] = { 1, 1 };
28static int ip_local_port_range_max[] = { 65535, 65535 }; 30static int ip_local_port_range_max[] = { 65535, 65535 };
31static int tcp_adv_win_scale_min = -31;
32static int tcp_adv_win_scale_max = 31;
33static int ip_ttl_min = 1;
34static int ip_ttl_max = 255;
35static int ip_ping_group_range_min[] = { 0, 0 };
36static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
29 37
30/* Update system visible IP port range */ 38/* Update system visible IP port range */
31static void set_local_port_range(int range[2]) 39static void set_local_port_range(int range[2])
@@ -64,6 +72,53 @@ static int ipv4_local_port_range(ctl_table *table, int write,
64 return ret; 72 return ret;
65} 73}
66 74
75
76void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
77{
78 gid_t *data = table->data;
79 unsigned seq;
80 do {
81 seq = read_seqbegin(&sysctl_local_ports.lock);
82
83 *low = data[0];
84 *high = data[1];
85 } while (read_seqretry(&sysctl_local_ports.lock, seq));
86}
87
88/* Update system visible IP port range */
89static void set_ping_group_range(struct ctl_table *table, int range[2])
90{
91 gid_t *data = table->data;
92 write_seqlock(&sysctl_local_ports.lock);
93 data[0] = range[0];
94 data[1] = range[1];
95 write_sequnlock(&sysctl_local_ports.lock);
96}
97
98/* Validate changes from /proc interface. */
99static int ipv4_ping_group_range(ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 int ret;
104 gid_t range[2];
105 ctl_table tmp = {
106 .data = &range,
107 .maxlen = sizeof(range),
108 .mode = table->mode,
109 .extra1 = &ip_ping_group_range_min,
110 .extra2 = &ip_ping_group_range_max,
111 };
112
113 inet_get_ping_group_range_table(table, range, range + 1);
114 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
115
116 if (write && ret == 0)
117 set_ping_group_range(table, range);
118
119 return ret;
120}
121
67static int proc_tcp_congestion_control(ctl_table *ctl, int write, 122static int proc_tcp_congestion_control(ctl_table *ctl, int write,
68 void __user *buffer, size_t *lenp, loff_t *ppos) 123 void __user *buffer, size_t *lenp, loff_t *ppos)
69{ 124{
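Note: ping_group_range is the policy knob for the unprivileged ICMP ("ping") sockets this series introduces (see the new <net/ping.h> include earlier in this file); the per-netns default set later in this file, "1 0", means no group qualifies. A sketch of the intended use, with gid 1000 purely illustrative:

	/* admin side:  echo "1000 1000" > /proc/sys/net/ipv4/ping_group_range */

	/* a process whose gid falls inside that range can then open an ICMP
	 * datagram socket without CAP_NET_RAW: */
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
	if (fd < 0)
		perror("ping socket");	/* refused (typically EACCES) outside the range */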
@@ -153,8 +208,9 @@ static struct ctl_table ipv4_table[] = {
153 .data = &sysctl_ip_default_ttl, 208 .data = &sysctl_ip_default_ttl,
154 .maxlen = sizeof(int), 209 .maxlen = sizeof(int),
155 .mode = 0644, 210 .mode = 0644,
156 .proc_handler = ipv4_doint_and_flush, 211 .proc_handler = proc_dointvec_minmax,
157 .extra2 = &init_net, 212 .extra1 = &ip_ttl_min,
213 .extra2 = &ip_ttl_max,
158 }, 214 },
159 { 215 {
160 .procname = "ip_no_pmtu_disc", 216 .procname = "ip_no_pmtu_disc",
@@ -306,7 +362,6 @@ static struct ctl_table ipv4_table[] = {
306 .mode = 0644, 362 .mode = 0644,
307 .proc_handler = proc_do_large_bitmap, 363 .proc_handler = proc_do_large_bitmap,
308 }, 364 },
309#ifdef CONFIG_IP_MULTICAST
310 { 365 {
311 .procname = "igmp_max_memberships", 366 .procname = "igmp_max_memberships",
312 .data = &sysctl_igmp_max_memberships, 367 .data = &sysctl_igmp_max_memberships,
@@ -314,8 +369,6 @@ static struct ctl_table ipv4_table[] = {
314 .mode = 0644, 369 .mode = 0644,
315 .proc_handler = proc_dointvec 370 .proc_handler = proc_dointvec
316 }, 371 },
317
318#endif
319 { 372 {
320 .procname = "igmp_max_msf", 373 .procname = "igmp_max_msf",
321 .data = &sysctl_igmp_max_msf, 374 .data = &sysctl_igmp_max_msf,
@@ -398,7 +451,7 @@ static struct ctl_table ipv4_table[] = {
398 .data = &sysctl_tcp_mem, 451 .data = &sysctl_tcp_mem,
399 .maxlen = sizeof(sysctl_tcp_mem), 452 .maxlen = sizeof(sysctl_tcp_mem),
400 .mode = 0644, 453 .mode = 0644,
401 .proc_handler = proc_dointvec 454 .proc_handler = proc_doulongvec_minmax
402 }, 455 },
403 { 456 {
404 .procname = "tcp_wmem", 457 .procname = "tcp_wmem",
@@ -426,7 +479,9 @@ static struct ctl_table ipv4_table[] = {
426 .data = &sysctl_tcp_adv_win_scale, 479 .data = &sysctl_tcp_adv_win_scale,
427 .maxlen = sizeof(int), 480 .maxlen = sizeof(int),
428 .mode = 0644, 481 .mode = 0644,
429 .proc_handler = proc_dointvec 482 .proc_handler = proc_dointvec_minmax,
483 .extra1 = &tcp_adv_win_scale_min,
484 .extra2 = &tcp_adv_win_scale_max,
430 }, 485 },
431 { 486 {
432 .procname = "tcp_tw_reuse", 487 .procname = "tcp_tw_reuse",
@@ -602,8 +657,7 @@ static struct ctl_table ipv4_table[] = {
602 .data = &sysctl_udp_mem, 657 .data = &sysctl_udp_mem,
603 .maxlen = sizeof(sysctl_udp_mem), 658 .maxlen = sizeof(sysctl_udp_mem),
604 .mode = 0644, 659 .mode = 0644,
605 .proc_handler = proc_dointvec_minmax, 660 .proc_handler = proc_doulongvec_minmax,
606 .extra1 = &zero
607 }, 661 },
608 { 662 {
609 .procname = "udp_rmem_min", 663 .procname = "udp_rmem_min",
@@ -674,6 +728,13 @@ static struct ctl_table ipv4_net_table[] = {
674 .mode = 0644, 728 .mode = 0644,
675 .proc_handler = proc_dointvec 729 .proc_handler = proc_dointvec
676 }, 730 },
731 {
732 .procname = "ping_group_range",
733 .data = &init_net.ipv4.sysctl_ping_group_range,
734 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
735 .mode = 0644,
736 .proc_handler = ipv4_ping_group_range,
737 },
677 { } 738 { }
678}; 739};
679 740
@@ -708,8 +769,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
708 &net->ipv4.sysctl_icmp_ratemask; 769 &net->ipv4.sysctl_icmp_ratemask;
709 table[6].data = 770 table[6].data =
710 &net->ipv4.sysctl_rt_cache_rebuild_count; 771 &net->ipv4.sysctl_rt_cache_rebuild_count;
772 table[7].data =
773 &net->ipv4.sysctl_ping_group_range;
774
711 } 775 }
712 776
777 /*
778 * Sane defaults - nobody may create ping sockets.
779 * Boot scripts should set this to distro-specific group.
780 */
781 net->ipv4.sysctl_ping_group_range[0] = 1;
782 net->ipv4.sysctl_ping_group_range[1] = 0;
783
713 net->ipv4.sysctl_rt_cache_rebuild_count = 4; 784 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
714 785
715 net->ipv4.ipv4_hdr = register_net_sysctl_table(net, 786 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f115ea68a4ef..46febcacb729 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
282struct percpu_counter tcp_orphan_count; 282struct percpu_counter tcp_orphan_count;
283EXPORT_SYMBOL_GPL(tcp_orphan_count); 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
284 284
285int sysctl_tcp_mem[3] __read_mostly; 285long sysctl_tcp_mem[3] __read_mostly;
286int sysctl_tcp_wmem[3] __read_mostly; 286int sysctl_tcp_wmem[3] __read_mostly;
287int sysctl_tcp_rmem[3] __read_mostly; 287int sysctl_tcp_rmem[3] __read_mostly;
288 288
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
290EXPORT_SYMBOL(sysctl_tcp_rmem); 290EXPORT_SYMBOL(sysctl_tcp_rmem);
291EXPORT_SYMBOL(sysctl_tcp_wmem); 291EXPORT_SYMBOL(sysctl_tcp_wmem);
292 292
293atomic_t tcp_memory_allocated; /* Current allocated memory. */ 293atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
294EXPORT_SYMBOL(tcp_memory_allocated); 294EXPORT_SYMBOL(tcp_memory_allocated);
295 295
296/* 296/*
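Note: sysctl_tcp_mem[] and tcp_memory_allocated switch from int/atomic_t to long/atomic_long_t so that page-count limits and the global allocation counter cannot wrap on large-memory 64-bit machines; the matching handler switch (proc_dointvec to proc_doulongvec_minmax for tcp_mem and udp_mem) appears in the sysctl_net_ipv4.c hunks above.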
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
505 else 505 else
506 answ = tp->write_seq - tp->snd_una; 506 answ = tp->write_seq - tp->snd_una;
507 break; 507 break;
508 case SIOCOUTQNSD:
509 if (sk->sk_state == TCP_LISTEN)
510 return -EINVAL;
511
512 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
513 answ = 0;
514 else
515 answ = tp->write_seq - tp->snd_nxt;
516 break;
508 default: 517 default:
509 return -ENOIOCTLCMD; 518 return -ENOIOCTLCMD;
510 } 519 }
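Note: SIOCOUTQNSD reports bytes queued but not yet sent (write_seq - snd_nxt), whereas the existing SIOCOUTQ case above reports bytes not yet acknowledged (write_seq - snd_una). A minimal userspace sketch, assuming an established TCP socket fd:

	int unsent = 0;
	if (ioctl(fd, SIOCOUTQNSD, &unsent) == 0)	/* constant from <linux/sockios.h> */
		printf("bytes still unsent in the send queue: %d\n", unsent);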
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
873 flags); 882 flags);
874 883
875 lock_sock(sk); 884 lock_sock(sk);
876 TCP_CHECK_TIMER(sk);
877 res = do_tcp_sendpages(sk, &page, offset, size, flags); 885 res = do_tcp_sendpages(sk, &page, offset, size, flags);
878 TCP_CHECK_TIMER(sk);
879 release_sock(sk); 886 release_sock(sk);
880 return res; 887 return res;
881} 888}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
916 long timeo; 923 long timeo;
917 924
918 lock_sock(sk); 925 lock_sock(sk);
919 TCP_CHECK_TIMER(sk);
920 926
921 flags = msg->msg_flags; 927 flags = msg->msg_flags;
922 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 928 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -993,7 +999,8 @@ new_segment:
993 /* We have some space in skb head. Superb! */ 999 /* We have some space in skb head. Superb! */
994 if (copy > skb_tailroom(skb)) 1000 if (copy > skb_tailroom(skb))
995 copy = skb_tailroom(skb); 1001 copy = skb_tailroom(skb);
996 if ((err = skb_add_data(skb, from, copy)) != 0) 1002 err = skb_add_data_nocache(sk, skb, from, copy);
1003 if (err)
997 goto do_fault; 1004 goto do_fault;
998 } else { 1005 } else {
999 int merge = 0; 1006 int merge = 0;
@@ -1036,8 +1043,8 @@ new_segment:
1036 1043
1037 /* Time to copy data. We are close to 1044 /* Time to copy data. We are close to
1038 * the end! */ 1045 * the end! */
1039 err = skb_copy_to_page(sk, from, skb, page, 1046 err = skb_copy_to_page_nocache(sk, from, skb,
1040 off, copy); 1047 page, off, copy);
1041 if (err) { 1048 if (err) {
1042 /* If this page was new, give it to the 1049 /* If this page was new, give it to the
1043 * socket so it does not get leaked. 1050 * socket so it does not get leaked.
@@ -1104,7 +1111,6 @@ wait_for_memory:
1104out: 1111out:
1105 if (copied) 1112 if (copied)
1106 tcp_push(sk, flags, mss_now, tp->nonagle); 1113 tcp_push(sk, flags, mss_now, tp->nonagle);
1107 TCP_CHECK_TIMER(sk);
1108 release_sock(sk); 1114 release_sock(sk);
1109 return copied; 1115 return copied;
1110 1116
@@ -1123,7 +1129,6 @@ do_error:
1123 goto out; 1129 goto out;
1124out_err: 1130out_err:
1125 err = sk_stream_error(sk, flags, err); 1131 err = sk_stream_error(sk, flags, err);
1126 TCP_CHECK_TIMER(sk);
1127 release_sock(sk); 1132 release_sock(sk);
1128 return err; 1133 return err;
1129} 1134}
@@ -1193,7 +1198,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1193 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1198 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1194 1199
1195 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1200 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1196 KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1201 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1197 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1202 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1198#endif 1203#endif
1199 1204
@@ -1415,8 +1420,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1415 1420
1416 lock_sock(sk); 1421 lock_sock(sk);
1417 1422
1418 TCP_CHECK_TIMER(sk);
1419
1420 err = -ENOTCONN; 1423 err = -ENOTCONN;
1421 if (sk->sk_state == TCP_LISTEN) 1424 if (sk->sk_state == TCP_LISTEN)
1422 goto out; 1425 goto out;
@@ -1477,10 +1480,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1477 * shouldn't happen. 1480 * shouldn't happen.
1478 */ 1481 */
1479 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), 1482 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1480 KERN_INFO "recvmsg bug: copied %X " 1483 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1481 "seq %X rcvnxt %X fl %X\n", *seq, 1484 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1482 TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 1485 flags))
1483 flags))
1484 break; 1486 break;
1485 1487
1486 offset = *seq - TCP_SKB_CB(skb)->seq; 1488 offset = *seq - TCP_SKB_CB(skb)->seq;
@@ -1490,10 +1492,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1490 goto found_ok_skb; 1492 goto found_ok_skb;
1491 if (tcp_hdr(skb)->fin) 1493 if (tcp_hdr(skb)->fin)
1492 goto found_fin_ok; 1494 goto found_fin_ok;
1493 WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " 1495 WARN(!(flags & MSG_PEEK),
1494 "copied %X seq %X rcvnxt %X fl %X\n", 1496 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1495 *seq, TCP_SKB_CB(skb)->seq, 1497 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1496 tp->rcv_nxt, flags);
1497 } 1498 }
1498 1499
1499 /* Well, if we have backlog, try to process it now yet. */ 1500 /* Well, if we have backlog, try to process it now yet. */
@@ -1769,12 +1770,10 @@ skip_copy:
1769 /* Clean up data we have read: This will do ACK frames. */ 1770 /* Clean up data we have read: This will do ACK frames. */
1770 tcp_cleanup_rbuf(sk, copied); 1771 tcp_cleanup_rbuf(sk, copied);
1771 1772
1772 TCP_CHECK_TIMER(sk);
1773 release_sock(sk); 1773 release_sock(sk);
1774 return copied; 1774 return copied;
1775 1775
1776out: 1776out:
1777 TCP_CHECK_TIMER(sk);
1778 release_sock(sk); 1777 release_sock(sk);
1779 return err; 1778 return err;
1780 1779
@@ -2246,7 +2245,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2246 /* Values greater than interface MTU won't take effect. However 2245 /* Values greater than interface MTU won't take effect. However
2247 * at the point when this call is done we typically don't yet 2246 * at the point when this call is done we typically don't yet
2248 * know which interface is going to be used */ 2247 * know which interface is going to be used */
2249 if (val < 8 || val > MAX_TCP_WINDOW) { 2248 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2250 err = -EINVAL; 2249 err = -EINVAL;
2251 break; 2250 break;
2252 } 2251 }
@@ -2392,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2392 err = tp->af_specific->md5_parse(sk, optval, optlen); 2391 err = tp->af_specific->md5_parse(sk, optval, optlen);
2393 break; 2392 break;
2394#endif 2393#endif
2395 2394 case TCP_USER_TIMEOUT:
2395 /* Cap the max timeout in ms TCP will retry/retrans
2396 * before giving up and aborting (ETIMEDOUT) a connection.
2397 */
2398 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2399 break;
2396 default: 2400 default:
2397 err = -ENOPROTOOPT; 2401 err = -ENOPROTOOPT;
2398 break; 2402 break;
@@ -2611,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2611 case TCP_THIN_DUPACK: 2615 case TCP_THIN_DUPACK:
2612 val = tp->thin_dupack; 2616 val = tp->thin_dupack;
2613 break; 2617 break;
2618
2619 case TCP_USER_TIMEOUT:
2620 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2621 break;
2614 default: 2622 default:
2615 return -ENOPROTOOPT; 2623 return -ENOPROTOOPT;
2616 } 2624 }
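Note: TCP_USER_TIMEOUT caps, in milliseconds, how long transmitted data may remain unacknowledged before the kernel aborts the connection with ETIMEDOUT; a value of 0 leaves the default retransmission policy in place. A minimal sketch (the 30-second value is illustrative):

	unsigned int tmo_ms = 30000;
	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &tmo_ms, sizeof(tmo_ms)) < 0)
		perror("TCP_USER_TIMEOUT");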
@@ -2646,7 +2654,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2646EXPORT_SYMBOL(compat_tcp_getsockopt); 2654EXPORT_SYMBOL(compat_tcp_getsockopt);
2647#endif 2655#endif
2648 2656
2649struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2657struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2650{ 2658{
2651 struct sk_buff *segs = ERR_PTR(-EINVAL); 2659 struct sk_buff *segs = ERR_PTR(-EINVAL);
2652 struct tcphdr *th; 2660 struct tcphdr *th;
@@ -3212,7 +3220,7 @@ __setup("thash_entries=", set_thash_entries);
3212void __init tcp_init(void) 3220void __init tcp_init(void)
3213{ 3221{
3214 struct sk_buff *skb = NULL; 3222 struct sk_buff *skb = NULL;
3215 unsigned long nr_pages, limit; 3223 unsigned long limit;
3216 int i, max_share, cnt; 3224 int i, max_share, cnt;
3217 unsigned long jiffy = jiffies; 3225 unsigned long jiffy = jiffies;
3218 3226
@@ -3269,13 +3277,7 @@ void __init tcp_init(void)
3269 sysctl_tcp_max_orphans = cnt / 2; 3277 sysctl_tcp_max_orphans = cnt / 2;
3270 sysctl_max_syn_backlog = max(128, cnt / 256); 3278 sysctl_max_syn_backlog = max(128, cnt / 256);
3271 3279
3272 /* Set the pressure threshold to be a fraction of global memory that 3280 limit = nr_free_buffer_pages() / 8;
3273 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
3274 * memory, with a floor of 128 pages.
3275 */
3276 nr_pages = totalram_pages - totalhigh_pages;
3277 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
3278 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
3279 limit = max(limit, 128UL); 3281 limit = max(limit, 128UL);
3280 sysctl_tcp_mem[0] = limit / 4 * 3; 3282 sysctl_tcp_mem[0] = limit / 4 * 3;
3281 sysctl_tcp_mem[1] = limit; 3283 sysctl_tcp_mem[1] = limit;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23f..6187eb4d1dcf 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
209} 209}
210 210
211 211
212static struct tcp_congestion_ops bictcp = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
213 .init = bictcp_init, 213 .init = bictcp_init,
214 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
215 .cong_avoid = bictcp_cong_avoid, 215 .cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa6..f376b05cca81 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -39,7 +39,7 @@
39 39
40/* Number of delay samples for detecting the increase of delay */ 40/* Number of delay samples for detecting the increase of delay */
41#define HYSTART_MIN_SAMPLES 8 41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (2U<<3) 42#define HYSTART_DELAY_MIN (4U<<3)
43#define HYSTART_DELAY_MAX (16U<<3) 43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45 45
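Note: these thresholds are kept in the same fixed-point unit as delay_min, msec << 3 (see the updated struct bictcp comment below), so this change raises the minimum HyStart delay threshold from 2U<<3 = 16 (2 ms) to 4U<<3 = 32 (4 ms), while the maximum stays at 16 ms.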
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;
52static int hystart __read_mostly = 1; 52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16; 54static int hystart_low_window __read_mostly = 16;
55static int hystart_ack_delta __read_mostly = 2;
55 56
56static u32 cube_rtt_scale __read_mostly; 57static u32 cube_rtt_scale __read_mostly;
57static u32 beta_scale __read_mostly; 58static u32 beta_scale __read_mostly;
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
75 " 1: packet-train 2: delay 3: both packet-train and delay"); 76 " 1: packet-train 2: delay 3: both packet-train and delay");
76module_param(hystart_low_window, int, 0644); 77module_param(hystart_low_window, int, 0644);
77MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
78 81
79/* BIC TCP Parameters */ 82/* BIC TCP Parameters */
80struct bictcp { 83struct bictcp {
@@ -85,17 +88,18 @@ struct bictcp {
85 u32 last_time; /* time when updated last_cwnd */ 88 u32 last_time; /* time when updated last_cwnd */
86 u32 bic_origin_point;/* origin point of bic function */ 89 u32 bic_origin_point;/* origin point of bic function */
87 u32 bic_K; /* time to origin point from the beginning of the current epoch */ 90 u32 bic_K; /* time to origin point from the beginning of the current epoch */
88 u32 delay_min; /* min delay */ 91 u32 delay_min; /* min delay (msec << 3) */
89 u32 epoch_start; /* beginning of an epoch */ 92 u32 epoch_start; /* beginning of an epoch */
90 u32 ack_cnt; /* number of acks */ 93 u32 ack_cnt; /* number of acks */
91 u32 tcp_cwnd; /* estimated tcp cwnd */ 94 u32 tcp_cwnd; /* estimated tcp cwnd */
92#define ACK_RATIO_SHIFT 4 95#define ACK_RATIO_SHIFT 4
96#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
93 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 97 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
94 u8 sample_cnt; /* number of samples to decide curr_rtt */ 98 u8 sample_cnt; /* number of samples to decide curr_rtt */
95 u8 found; /* the exit point is found? */ 99 u8 found; /* the exit point is found? */
96 u32 round_start; /* beginning of each round */ 100 u32 round_start; /* beginning of each round */
97 u32 end_seq; /* end_seq of the round */ 101 u32 end_seq; /* end_seq of the round */
98 u32 last_jiffies; /* last time when the ACK spacing is close */ 102 u32 last_ack; /* last time when the ACK spacing is close */
99 u32 curr_rtt; /* the minimum rtt of current round */ 103 u32 curr_rtt; /* the minimum rtt of current round */
100}; 104};
101 105
@@ -116,12 +120,21 @@ static inline void bictcp_reset(struct bictcp *ca)
116 ca->found = 0; 120 ca->found = 0;
117} 121}
118 122
123static inline u32 bictcp_clock(void)
124{
125#if HZ < 1000
126 return ktime_to_ms(ktime_get_real());
127#else
128 return jiffies_to_msecs(jiffies);
129#endif
130}
131
119static inline void bictcp_hystart_reset(struct sock *sk) 132static inline void bictcp_hystart_reset(struct sock *sk)
120{ 133{
121 struct tcp_sock *tp = tcp_sk(sk); 134 struct tcp_sock *tp = tcp_sk(sk);
122 struct bictcp *ca = inet_csk_ca(sk); 135 struct bictcp *ca = inet_csk_ca(sk);
123 136
124 ca->round_start = ca->last_jiffies = jiffies; 137 ca->round_start = ca->last_ack = bictcp_clock();
125 ca->end_seq = tp->snd_nxt; 138 ca->end_seq = tp->snd_nxt;
126 ca->curr_rtt = 0; 139 ca->curr_rtt = 0;
127 ca->sample_cnt = 0; 140 ca->sample_cnt = 0;
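Note: bictcp_clock() gives HyStart a millisecond clock even on HZ=100 or HZ=250 kernels, where one jiffy is 10 ms or 4 ms and therefore too coarse to compare against the 4-16 ms delay thresholds above; on such kernels it reads ktime_get_real() instead of jiffies, and cubictcp_register() (later in this diff) sets TCP_CONG_RTT_STAMP so that per-ACK RTT samples are available.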
@@ -236,8 +249,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
236 */ 249 */
237 250
238 /* change the unit from HZ to bictcp_HZ */ 251 /* change the unit from HZ to bictcp_HZ */
239 t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) 252 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
240 << BICTCP_HZ) / HZ; 253 - ca->epoch_start) << BICTCP_HZ) / HZ;
241 254
242 if (t < ca->bic_K) /* t - K */ 255 if (t < ca->bic_K) /* t - K */
243 offs = ca->bic_K - t; 256 offs = ca->bic_K - t;
@@ -258,6 +271,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
258 ca->cnt = 100 * cwnd; /* very small increment*/ 271 ca->cnt = 100 * cwnd; /* very small increment*/
259 } 272 }
260 273
274 /*
275 * The initial growth of cubic function may be too conservative
276 * when the available bandwidth is still unknown.
277 */
278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
279 ca->cnt = 20; /* increase cwnd 5% per RTT */
280
261 /* TCP Friendly */ 281 /* TCP Friendly */
262 if (tcp_friendliness) { 282 if (tcp_friendliness) {
263 u32 scale = beta_scale; 283 u32 scale = beta_scale;
@@ -339,12 +359,12 @@ static void hystart_update(struct sock *sk, u32 delay)
339 struct bictcp *ca = inet_csk_ca(sk); 359 struct bictcp *ca = inet_csk_ca(sk);
340 360
341 if (!(ca->found & hystart_detect)) { 361 if (!(ca->found & hystart_detect)) {
342 u32 curr_jiffies = jiffies; 362 u32 now = bictcp_clock();
343 363
344 /* first detection parameter - ack-train detection */ 364 /* first detection parameter - ack-train detection */
345 if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { 365 if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
346 ca->last_jiffies = curr_jiffies; 366 ca->last_ack = now;
347 if (curr_jiffies - ca->round_start >= ca->delay_min>>4) 367 if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
348 ca->found |= HYSTART_ACK_TRAIN; 368 ca->found |= HYSTART_ACK_TRAIN;
349 } 369 }
350 370
@@ -379,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
379 u32 delay; 399 u32 delay;
380 400
381 if (icsk->icsk_ca_state == TCP_CA_Open) { 401 if (icsk->icsk_ca_state == TCP_CA_Open) {
382 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 402 u32 ratio = ca->delayed_ack;
383 ca->delayed_ack += cnt; 403
404 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
405 ratio += cnt;
406
407 ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
384 } 408 }
385 409
386 /* Some calls are for duplicates without timetamps */ 410 /* Some calls are for duplicates without timetamps */
@@ -391,7 +415,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
391 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 415 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
392 return; 416 return;
393 417
394 delay = usecs_to_jiffies(rtt_us) << 3; 418 delay = (rtt_us << 3) / USEC_PER_MSEC;
395 if (delay == 0) 419 if (delay == 0)
396 delay = 1; 420 delay = 1;
397 421
@@ -405,7 +429,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
405 hystart_update(sk, delay); 429 hystart_update(sk, delay);
406} 430}
407 431
408static struct tcp_congestion_ops cubictcp = { 432static struct tcp_congestion_ops cubictcp __read_mostly = {
409 .init = bictcp_init, 433 .init = bictcp_init,
410 .ssthresh = bictcp_recalc_ssthresh, 434 .ssthresh = bictcp_recalc_ssthresh,
411 .cong_avoid = bictcp_cong_avoid, 435 .cong_avoid = bictcp_cong_avoid,
@@ -447,6 +471,10 @@ static int __init cubictcp_register(void)
447 /* divide by bic_scale and by constant Srtt (100ms) */ 471 /* divide by bic_scale and by constant Srtt (100ms) */
448 do_div(cube_factor, bic_scale * 10); 472 do_div(cube_factor, bic_scale * 10);
449 473
474 /* hystart needs ms clock resolution */
475 if (hystart && HZ < 1000)
476 cubictcp.flags |= TCP_CONG_RTT_STAMP;
477
450 return tcp_register_congestion_control(&cubictcp); 478 return tcp_register_congestion_control(&cubictcp);
451} 479}
452 480
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb9..30f27f6b3655 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
158} 158}
159 159
160 160
161static struct tcp_congestion_ops tcp_highspeed = { 161static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a4955416..c1a8175361e8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
284 } 284 }
285} 285}
286 286
287static struct tcp_congestion_ops htcp = { 287static struct tcp_congestion_ops htcp __read_mostly = {
288 .init = htcp_init, 288 .init = htcp_init,
289 .ssthresh = htcp_recalc_ssthresh, 289 .ssthresh = htcp_recalc_ssthresh,
290 .cong_avoid = htcp_cong_avoid, 290 .cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc9349371..fe3ecf484b44 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); 162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
163} 163}
164 164
165static struct tcp_congestion_ops tcp_hybla = { 165static struct tcp_congestion_ops tcp_hybla __read_mostly = {
166 .init = hybla_init, 166 .init = hybla_init,
167 .ssthresh = tcp_reno_ssthresh, 167 .ssthresh = tcp_reno_ssthresh,
168 .min_cwnd = tcp_reno_min_cwnd, 168 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..813b43a76fec 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
6 * The algorithm is described in: 6 * The algorithm is described in:
7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm 7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
8 * for High-Speed Networks" 8 * for High-Speed Networks"
9 * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf 9 * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
10 * 10 *
11 * Implemented from description in paper and ns-2 simulation. 11 * Implemented from description in paper and ns-2 simulation.
12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> 12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
322 } 322 }
323} 323}
324 324
325static struct tcp_congestion_ops tcp_illinois = { 325static struct tcp_congestion_ops tcp_illinois __read_mostly = {
326 .flags = TCP_CONG_RTT_STAMP, 326 .flags = TCP_CONG_RTT_STAMP,
327 .init = tcp_illinois_init, 327 .init = tcp_illinois_init,
328 .ssthresh = tcp_illinois_ssthresh, 328 .ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b55f60f6fcbe..bef9f04c22ba 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
183} 183}
184 184
185void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
186{ 186{
187 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
188 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
260 sizeof(struct sk_buff); 260 sizeof(struct sk_buff);
261 261
262 if (sk->sk_sndbuf < 3 * sndmem) 262 if (sk->sk_sndbuf < 3 * sndmem) {
263 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); 263 sk->sk_sndbuf = 3 * sndmem;
264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
264} 267}
265 268
266/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
396 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
397 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
398 !tcp_memory_pressure && 401 !tcp_memory_pressure &&
399 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
400 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
401 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
402 } 405 }
@@ -428,10 +431,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
428 * 431 *
429 * The algorithm for RTT estimation w/o timestamps is based on 432 * The algorithm for RTT estimation w/o timestamps is based on
430 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. 433 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
431 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> 434 * <http://public.lanl.gov/radiant/pubs.html#DRS>
432 * 435 *
433 * More detail on this code can be found at 436 * More detail on this code can be found at
434 * <http://www.psc.edu/~jheffner/senior_thesis.ps>, 437 * <http://staff.psc.edu/jheffner/>,
435 * though this reference is out of date. A new paper 438 * though this reference is out of date. A new paper
436 * is pending. 439 * is pending.
437 */ 440 */
@@ -731,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
731 * Reset our results. 734 * Reset our results.
732 */ 735 */
733 if (!(dst_metric_locked(dst, RTAX_RTT))) 736 if (!(dst_metric_locked(dst, RTAX_RTT)))
734 dst->metrics[RTAX_RTT - 1] = 0; 737 dst_metric_set(dst, RTAX_RTT, 0);
735 return; 738 return;
736 } 739 }
737 740
@@ -773,57 +776,48 @@ void tcp_update_metrics(struct sock *sk)
773 if (dst_metric(dst, RTAX_SSTHRESH) && 776 if (dst_metric(dst, RTAX_SSTHRESH) &&
774 !dst_metric_locked(dst, RTAX_SSTHRESH) && 777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
775 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
776 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; 779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
777 if (!dst_metric_locked(dst, RTAX_CWND) && 780 if (!dst_metric_locked(dst, RTAX_CWND) &&
778 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
779 dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; 782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
780 } else if (tp->snd_cwnd > tp->snd_ssthresh && 783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
781 icsk->icsk_ca_state == TCP_CA_Open) { 784 icsk->icsk_ca_state == TCP_CA_Open) {
782 /* Cong. avoidance phase, cwnd is reliable. */ 785 /* Cong. avoidance phase, cwnd is reliable. */
783 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
784 dst->metrics[RTAX_SSTHRESH-1] = 787 dst_metric_set(dst, RTAX_SSTHRESH,
785 max(tp->snd_cwnd >> 1, tp->snd_ssthresh); 788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
786 if (!dst_metric_locked(dst, RTAX_CWND)) 789 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; 790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
788 } else { 793 } else {
789 /* Else slow start did not finish, cwnd is non-sense, 794 /* Else slow start did not finish, cwnd is non-sense,
790 ssthresh may be also invalid. 795 ssthresh may be also invalid.
791 */ 796 */
792 if (!dst_metric_locked(dst, RTAX_CWND)) 797 if (!dst_metric_locked(dst, RTAX_CWND))
793 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; 798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
794 if (dst_metric(dst, RTAX_SSTHRESH) && 801 if (dst_metric(dst, RTAX_SSTHRESH) &&
795 !dst_metric_locked(dst, RTAX_SSTHRESH) && 802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
796 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
797 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; 804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
798 } 805 }
799 806
800 if (!dst_metric_locked(dst, RTAX_REORDERING)) { 807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
801 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
802 tp->reordering != sysctl_tcp_reordering) 809 tp->reordering != sysctl_tcp_reordering)
803 dst->metrics[RTAX_REORDERING-1] = tp->reordering; 810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
804 } 811 }
805 } 812 }
806} 813}
807 814
808/* Numbers are taken from RFC3390.
809 *
810 * John Heffner states:
811 *
812 * The RFC specifies a window of no more than 4380 bytes
813 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
814 * is a bit misleading because they use a clamp at 4380 bytes
815 * rather than use a multiplier in the relevant range.
816 */
817__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 815__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
818{ 816{
819 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
820 818
821 if (!cwnd) { 819 if (!cwnd)
822 if (tp->mss_cache > 1460) 820 cwnd = TCP_INIT_CWND;
823 cwnd = 2;
824 else
825 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
826 }
827 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
828} 822}
829 823
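Note: with the RFC 3390 sizing logic above removed, the initial congestion window no longer depends on the MSS (the old 4380-byte clamp yielding 2, 3 or 4 segments); it is simply TCP_INIT_CWND, still clamped by snd_cwnd_clamp. TCP_INIT_CWND is defined in the headers, outside this hunk, as 10 segments in this kernel generation (the IW10 change), if memory serves.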
@@ -922,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
922 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
923 } 917 }
924 tcp_set_rto(sk); 918 tcp_set_rto(sk);
925 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
926 goto reset;
927
928cwnd:
929 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
930 tp->snd_cwnd_stamp = tcp_time_stamp;
931 return;
932
933reset: 920reset:
934 /* Play conservative. If timestamps are not 921 /* Play conservative. If timestamps are not
935 * supported, TCP will fail to recalculate correct 922 * supported, TCP will fail to recalculate correct
936 * rtt, if initial rto is too small. FORGET ALL AND RESET! 923 * rtt, if initial rto is too small. FORGET ALL AND RESET!
937 */ 924 */
938 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 925 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
939 tp->srtt = 0; 926 tp->srtt = 0;
940 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
941 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
929 }
942 } 930 }
943 goto cwnd; 931 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
932 tp->snd_cwnd_stamp = tcp_time_stamp;
944} 933}
945 934
946static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -1233,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1233 } 1222 }
1234 1223
1235 /* D-SACK for already forgotten data... Do dumb counting. */ 1224 /* D-SACK for already forgotten data... Do dumb counting. */
1236 if (dup_sack && 1225 if (dup_sack && tp->undo_marker && tp->undo_retrans &&
1237 !after(end_seq_0, prior_snd_una) && 1226 !after(end_seq_0, prior_snd_una) &&
1238 after(end_seq_0, tp->undo_marker)) 1227 after(end_seq_0, tp->undo_marker))
1239 tp->undo_retrans--; 1228 tp->undo_retrans--;
@@ -1310,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1310 1299
1311 /* Account D-SACK for retransmitted packet. */ 1300 /* Account D-SACK for retransmitted packet. */
1312 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1301 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1313 if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) 1302 if (tp->undo_marker && tp->undo_retrans &&
1303 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1314 tp->undo_retrans--; 1304 tp->undo_retrans--;
1315 if (sacked & TCPCB_SACKED_ACKED) 1305 if (sacked & TCPCB_SACKED_ACKED)
1316 state->reord = min(fack_count, state->reord); 1306 state->reord = min(fack_count, state->reord);
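Both D-SACK hunks above add the same guard: a D-SACKed retransmission is only counted against undo_retrans while undo bookkeeping is actually live (undo_marker set and undo_retrans non-zero), so the counter cannot be driven negative after a prior undo already cleared it. A plain sketch of the guard, with the helper name purely illustrative:

/* Decrement the undo counter only while an undo is being tracked. */
static void account_dsack_retrans_sketch(unsigned int *undo_marker,
                                         int *undo_retrans)
{
        if (*undo_marker && *undo_retrans)
                (*undo_retrans)--;
}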
@@ -2314,7 +2304,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2314 2304
2315static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2305static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2316{ 2306{
2317 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2307 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2318} 2308}
2319 2309
2320static inline int tcp_head_timedout(struct sock *sk) 2310static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2498,7 @@ static void tcp_timeout_skbs(struct sock *sk)
2508/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2498/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2509 * is against sacked "cnt", otherwise it's against facked "cnt" 2499 * is against sacked "cnt", otherwise it's against facked "cnt"
2510 */ 2500 */
2511static void tcp_mark_head_lost(struct sock *sk, int packets) 2501static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2512{ 2502{
2513 struct tcp_sock *tp = tcp_sk(sk); 2503 struct tcp_sock *tp = tcp_sk(sk);
2514 struct sk_buff *skb; 2504 struct sk_buff *skb;
@@ -2516,13 +2506,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2516 int err; 2506 int err;
2517 unsigned int mss; 2507 unsigned int mss;
2518 2508
2519 if (packets == 0)
2520 return;
2521
2522 WARN_ON(packets > tp->packets_out); 2509 WARN_ON(packets > tp->packets_out);
2523 if (tp->lost_skb_hint) { 2510 if (tp->lost_skb_hint) {
2524 skb = tp->lost_skb_hint; 2511 skb = tp->lost_skb_hint;
2525 cnt = tp->lost_cnt_hint; 2512 cnt = tp->lost_cnt_hint;
2513 /* Head already handled? */
2514 if (mark_head && skb != tcp_write_queue_head(sk))
2515 return;
2526 } else { 2516 } else {
2527 skb = tcp_write_queue_head(sk); 2517 skb = tcp_write_queue_head(sk);
2528 cnt = 0; 2518 cnt = 0;
@@ -2557,6 +2547,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2557 } 2547 }
2558 2548
2559 tcp_skb_mark_lost(tp, skb); 2549 tcp_skb_mark_lost(tp, skb);
2550
2551 if (mark_head)
2552 break;
2560 } 2553 }
2561 tcp_verify_left_out(tp); 2554 tcp_verify_left_out(tp);
2562} 2555}
@@ -2568,17 +2561,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2568 struct tcp_sock *tp = tcp_sk(sk); 2561 struct tcp_sock *tp = tcp_sk(sk);
2569 2562
2570 if (tcp_is_reno(tp)) { 2563 if (tcp_is_reno(tp)) {
2571 tcp_mark_head_lost(sk, 1); 2564 tcp_mark_head_lost(sk, 1, 1);
2572 } else if (tcp_is_fack(tp)) { 2565 } else if (tcp_is_fack(tp)) {
2573 int lost = tp->fackets_out - tp->reordering; 2566 int lost = tp->fackets_out - tp->reordering;
2574 if (lost <= 0) 2567 if (lost <= 0)
2575 lost = 1; 2568 lost = 1;
2576 tcp_mark_head_lost(sk, lost); 2569 tcp_mark_head_lost(sk, lost, 0);
2577 } else { 2570 } else {
2578 int sacked_upto = tp->sacked_out - tp->reordering; 2571 int sacked_upto = tp->sacked_out - tp->reordering;
2579 if (sacked_upto < fast_rexmit) 2572 if (sacked_upto >= 0)
2580 sacked_upto = fast_rexmit; 2573 tcp_mark_head_lost(sk, sacked_upto, 0);
2581 tcp_mark_head_lost(sk, sacked_upto); 2574 else if (fast_rexmit)
2575 tcp_mark_head_lost(sk, 1, 1);
2582 } 2576 }
2583 2577
2584 tcp_timeout_skbs(sk); 2578 tcp_timeout_skbs(sk);
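tcp_mark_head_lost() gains a mark_head flag: when set, the walk stops after the first (head) segment, and it also bails out early if the loss hint no longer points at the queue head. tcp_update_scoreboard() uses it so that, in the non-FACK case, a fast retransmit marks exactly one segment until sacked_out exceeds the reordering threshold. A plain-integer sketch of that decision (kernel types omitted):

/* Decide how many segments to mark lost under the new non-FACK rule. */
static int segments_to_mark_sketch(int sacked_out, int reordering,
                                   int fast_rexmit, int *head_only)
{
        int sacked_upto = sacked_out - reordering;

        *head_only = 0;
        if (sacked_upto >= 0)
                return sacked_upto;
        if (fast_rexmit) {
                *head_only = 1;
                return 1;
        }
        return 0;       /* nothing to mark yet */
}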
@@ -2665,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2665#define DBGUNDO(x...) do { } while (0) 2659#define DBGUNDO(x...) do { } while (0)
2666#endif 2660#endif
2667 2661
2668static void tcp_undo_cwr(struct sock *sk, const int undo) 2662static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2669{ 2663{
2670 struct tcp_sock *tp = tcp_sk(sk); 2664 struct tcp_sock *tp = tcp_sk(sk);
2671 2665
@@ -2677,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2677 else 2671 else
2678 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); 2672 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2679 2673
2680 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { 2674 if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
2681 tp->snd_ssthresh = tp->prior_ssthresh; 2675 tp->snd_ssthresh = tp->prior_ssthresh;
2682 TCP_ECN_withdraw_cwr(tp); 2676 TCP_ECN_withdraw_cwr(tp);
2683 } 2677 }
2684 } else { 2678 } else {
2685 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); 2679 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
2686 } 2680 }
2687 tcp_moderate_cwnd(tp);
2688 tp->snd_cwnd_stamp = tcp_time_stamp; 2681 tp->snd_cwnd_stamp = tcp_time_stamp;
2689} 2682}
2690 2683
@@ -2705,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk)
2705 * or our original transmission succeeded. 2698 * or our original transmission succeeded.
2706 */ 2699 */
2707 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); 2700 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2708 tcp_undo_cwr(sk, 1); 2701 tcp_undo_cwr(sk, true);
2709 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) 2702 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2710 mib_idx = LINUX_MIB_TCPLOSSUNDO; 2703 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2711 else 2704 else
@@ -2732,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk)
2732 2725
2733 if (tp->undo_marker && !tp->undo_retrans) { 2726 if (tp->undo_marker && !tp->undo_retrans) {
2734 DBGUNDO(sk, "D-SACK"); 2727 DBGUNDO(sk, "D-SACK");
2735 tcp_undo_cwr(sk, 1); 2728 tcp_undo_cwr(sk, true);
2736 tp->undo_marker = 0; 2729 tp->undo_marker = 0;
2737 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2738 } 2731 }
@@ -2785,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2785 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2786 2779
2787 DBGUNDO(sk, "Hoe"); 2780 DBGUNDO(sk, "Hoe");
2788 tcp_undo_cwr(sk, 0); 2781 tcp_undo_cwr(sk, false);
2789 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 2782 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2790 2783
2791 /* So... Do not make Hoe's retransmit yet. 2784 /* So... Do not make Hoe's retransmit yet.
@@ -2814,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk)
2814 2807
2815 DBGUNDO(sk, "partial loss"); 2808 DBGUNDO(sk, "partial loss");
2816 tp->lost_out = 0; 2809 tp->lost_out = 0;
2817 tcp_undo_cwr(sk, 1); 2810 tcp_undo_cwr(sk, true);
2818 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2811 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2819 inet_csk(sk)->icsk_retransmits = 0; 2812 inet_csk(sk)->icsk_retransmits = 0;
2820 tp->undo_marker = 0; 2813 tp->undo_marker = 0;
@@ -2828,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk)
2828static inline void tcp_complete_cwr(struct sock *sk) 2821static inline void tcp_complete_cwr(struct sock *sk)
2829{ 2822{
2830 struct tcp_sock *tp = tcp_sk(sk); 2823 struct tcp_sock *tp = tcp_sk(sk);
2831 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2824 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2832 tp->snd_cwnd_stamp = tcp_time_stamp; 2825 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2826 tp->snd_cwnd = tp->snd_ssthresh;
2827 tp->snd_cwnd_stamp = tcp_time_stamp;
2828 }
2833 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2829 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2834} 2830}
2835 2831
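Two related changes meet here: tcp_undo_cwr() now takes a bool saying whether ssthresh should be restored as well and no longer moderates cwnd after an undo, while tcp_complete_cwr() only clips cwnd back to ssthresh when undo bookkeeping is still active, so a window that was just restored is not immediately cut again. A plain-integer sketch of the new CWR-completion rule:

/* Moderate cwnd at CWR completion only if an undo is still pending
 * and cwnd actually sits above ssthresh. */
static unsigned int complete_cwr_sketch(unsigned int snd_cwnd,
                                        unsigned int snd_ssthresh,
                                        unsigned int undo_marker)
{
        if (undo_marker && snd_cwnd > snd_ssthresh)
                snd_cwnd = snd_ssthresh;
        return snd_cwnd;
}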
@@ -2887,7 +2883,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
2887 icsk->icsk_mtup.probe_size; 2883 icsk->icsk_mtup.probe_size;
2888 tp->snd_cwnd_cnt = 0; 2884 tp->snd_cwnd_cnt = 0;
2889 tp->snd_cwnd_stamp = tcp_time_stamp; 2885 tp->snd_cwnd_stamp = tcp_time_stamp;
2890 tp->rcv_ssthresh = tcp_current_ssthresh(sk); 2886 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2891 2887
2892 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 2888 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2893 icsk->icsk_mtup.probe_size = 0; 2889 icsk->icsk_mtup.probe_size = 0;
@@ -2984,7 +2980,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2984 before(tp->snd_una, tp->high_seq) && 2980 before(tp->snd_una, tp->high_seq) &&
2985 icsk->icsk_ca_state != TCP_CA_Open && 2981 icsk->icsk_ca_state != TCP_CA_Open &&
2986 tp->fackets_out > tp->reordering) { 2982 tp->fackets_out > tp->reordering) {
2987 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); 2983 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2988 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); 2984 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2989 } 2985 }
2990 2986
@@ -3356,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3356 net_invalid_timestamp())) 3352 net_invalid_timestamp()))
3357 rtt_us = ktime_us_delta(ktime_get_real(), 3353 rtt_us = ktime_us_delta(ktime_get_real(),
3358 last_ackt); 3354 last_ackt);
3359 else if (ca_seq_rtt > 0) 3355 else if (ca_seq_rtt >= 0)
3360 rtt_us = jiffies_to_usecs(ca_seq_rtt); 3356 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3361 } 3357 }
3362 3358
@@ -3412,8 +3408,8 @@ static void tcp_ack_probe(struct sock *sk)
3412 3408
3413static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3409static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3414{ 3410{
3415 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3411 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3416 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3412 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3417} 3413}
3418 3414
3419static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3415static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3430,9 +3426,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3430 const u32 ack, const u32 ack_seq, 3426 const u32 ack, const u32 ack_seq,
3431 const u32 nwin) 3427 const u32 nwin)
3432{ 3428{
3433 return (after(ack, tp->snd_una) || 3429 return after(ack, tp->snd_una) ||
3434 after(ack_seq, tp->snd_wl1) || 3430 after(ack_seq, tp->snd_wl1) ||
3435 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3431 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3436} 3432}
3437 3433
3438/* Update our send window. 3434/* Update our send window.
@@ -3500,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3500 if (flag & FLAG_ECE) 3496 if (flag & FLAG_ECE)
3501 tcp_ratehalving_spur_to_response(sk); 3497 tcp_ratehalving_spur_to_response(sk);
3502 else 3498 else
3503 tcp_undo_cwr(sk, 1); 3499 tcp_undo_cwr(sk, true);
3504} 3500}
3505 3501
3506/* F-RTO spurious RTO detection algorithm (RFC4138) 3502/* F-RTO spurious RTO detection algorithm (RFC4138)
@@ -4406,7 +4402,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4406 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { 4402 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
4407 tp->ucopy.len -= chunk; 4403 tp->ucopy.len -= chunk;
4408 tp->copied_seq += chunk; 4404 tp->copied_seq += chunk;
4409 eaten = (chunk == skb->len && !th->fin); 4405 eaten = (chunk == skb->len);
4410 tcp_rcv_space_adjust(sk); 4406 tcp_rcv_space_adjust(sk);
4411 } 4407 }
4412 local_bh_disable(); 4408 local_bh_disable();
@@ -4870,7 +4866,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
4870 return 0; 4866 return 0;
4871 4867
4872 /* If we are under soft global TCP memory pressure, do not expand. */ 4868 /* If we are under soft global TCP memory pressure, do not expand. */
4873 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) 4869 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4874 return 0; 4870 return 0;
4875 4871
4876 /* If we filled the congestion window, do not expand. */ 4872 /* If we filled the congestion window, do not expand. */
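The switch to atomic_long_read() reflects tcp_memory_allocated being widened to an atomic_long_t, so the global page counter cannot wrap a 32-bit atomic on machines with very large amounts of TCP memory. A minimal sketch of the pressure test with the widened type; the limit is passed in here rather than read from sysctl_tcp_mem:

#include <linux/atomic.h>

static atomic_long_t example_memory_allocated = ATOMIC_LONG_INIT(0);

/* Soft memory-pressure check against the widened counter. */
static int over_soft_limit_sketch(long soft_limit)
{
        return atomic_long_read(&example_memory_allocated) >= soft_limit;
}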
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..708dc203b034 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -146,13 +146,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146/* This will initiate an outgoing connection. */ 146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{ 148{
149 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
149 struct inet_sock *inet = inet_sk(sk); 150 struct inet_sock *inet = inet_sk(sk);
150 struct tcp_sock *tp = tcp_sk(sk); 151 struct tcp_sock *tp = tcp_sk(sk);
151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 152 __be16 orig_sport, orig_dport;
152 struct rtable *rt;
153 __be32 daddr, nexthop; 153 __be32 daddr, nexthop;
154 int tmp; 154 struct flowi4 *fl4;
155 struct rtable *rt;
155 int err; 156 int err;
157 struct ip_options_rcu *inet_opt;
156 158
157 if (addr_len < sizeof(struct sockaddr_in)) 159 if (addr_len < sizeof(struct sockaddr_in))
158 return -EINVAL; 160 return -EINVAL;
@@ -161,20 +163,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
161 return -EAFNOSUPPORT; 163 return -EAFNOSUPPORT;
162 164
163 nexthop = daddr = usin->sin_addr.s_addr; 165 nexthop = daddr = usin->sin_addr.s_addr;
164 if (inet->opt && inet->opt->srr) { 166 inet_opt = rcu_dereference_protected(inet->inet_opt,
167 sock_owned_by_user(sk));
168 if (inet_opt && inet_opt->opt.srr) {
165 if (!daddr) 169 if (!daddr)
166 return -EINVAL; 170 return -EINVAL;
167 nexthop = inet->opt->faddr; 171 nexthop = inet_opt->opt.faddr;
168 } 172 }
169 173
170 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, 174 orig_sport = inet->inet_sport;
171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 175 orig_dport = usin->sin_port;
172 IPPROTO_TCP, 176 fl4 = &inet->cork.fl.u.ip4;
173 inet->inet_sport, usin->sin_port, sk, 1); 177 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 if (tmp < 0) { 178 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 if (tmp == -ENETUNREACH) 179 IPPROTO_TCP,
180 orig_sport, orig_dport, sk, true);
181 if (IS_ERR(rt)) {
182 err = PTR_ERR(rt);
183 if (err == -ENETUNREACH)
176 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 184 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 return tmp; 185 return err;
178 } 186 }
179 187
180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 188 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -182,11 +190,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
182 return -ENETUNREACH; 190 return -ENETUNREACH;
183 } 191 }
184 192
185 if (!inet->opt || !inet->opt->srr) 193 if (!inet_opt || !inet_opt->opt.srr)
186 daddr = rt->rt_dst; 194 daddr = fl4->daddr;
187 195
188 if (!inet->inet_saddr) 196 if (!inet->inet_saddr)
189 inet->inet_saddr = rt->rt_src; 197 inet->inet_saddr = fl4->saddr;
190 inet->inet_rcv_saddr = inet->inet_saddr; 198 inet->inet_rcv_saddr = inet->inet_saddr;
191 199
192 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 200 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
@@ -197,8 +205,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
197 } 205 }
198 206
199 if (tcp_death_row.sysctl_tw_recycle && 207 if (tcp_death_row.sysctl_tw_recycle &&
200 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { 208 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
201 struct inet_peer *peer = rt_get_peer(rt); 209 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
202 /* 210 /*
203 * VJ's idea. We save last timestamp seen from 211 * VJ's idea. We save last timestamp seen from
204 * the destination in peer table, when entering state 212 * the destination in peer table, when entering state
@@ -218,8 +226,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
218 inet->inet_daddr = daddr; 226 inet->inet_daddr = daddr;
219 227
220 inet_csk(sk)->icsk_ext_hdr_len = 0; 228 inet_csk(sk)->icsk_ext_hdr_len = 0;
221 if (inet->opt) 229 if (inet_opt)
222 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; 230 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
223 231
224 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 232 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
225 233
@@ -233,11 +241,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
233 if (err) 241 if (err)
234 goto failure; 242 goto failure;
235 243
236 err = ip_route_newports(&rt, IPPROTO_TCP, 244 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
237 inet->inet_sport, inet->inet_dport, sk); 245 inet->inet_sport, inet->inet_dport, sk);
238 if (err) 246 if (IS_ERR(rt)) {
247 err = PTR_ERR(rt);
248 rt = NULL;
239 goto failure; 249 goto failure;
240 250 }
241 /* OK, now commit destination to socket. */ 251 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4; 252 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst); 253 sk_setup_caps(sk, &rt->dst);
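ip_route_connect() and ip_route_newports() now return the route directly (or an ERR_PTR-encoded errno) and take the flow keys in a struct flowi4 owned by the socket, instead of filling a struct rtable ** and returning an int. A small sketch of the calling convention these hunks rely on, with the lookup itself stubbed out:

#include <linux/err.h>

struct rtable;  /* opaque in this sketch; defined in include/net/route.h */

/* Consume an ERR_PTR-style return: one pointer carries either the
 * object or a negative errno. */
static int use_route_sketch(struct rtable *rt)
{
        if (IS_ERR(rt))
                return PTR_ERR(rt);     /* e.g. -ENETUNREACH */
        /* ... commit the route to the socket ... */
        return 0;
}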
@@ -273,7 +283,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
273/* 283/*
274 * This routine does path mtu discovery as defined in RFC1191. 284 * This routine does path mtu discovery as defined in RFC1191.
275 */ 285 */
276static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) 286static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
277{ 287{
278 struct dst_entry *dst; 288 struct dst_entry *dst;
279 struct inet_sock *inet = inet_sk(sk); 289 struct inet_sock *inet = inet_sk(sk);
@@ -335,7 +345,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
335 345
336void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 346void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
337{ 347{
338 struct iphdr *iph = (struct iphdr *)icmp_skb->data; 348 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
339 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 349 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
340 struct inet_connection_sock *icsk; 350 struct inet_connection_sock *icsk;
341 struct tcp_sock *tp; 351 struct tcp_sock *tp;
@@ -415,6 +425,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
415 !icsk->icsk_backoff) 425 !icsk->icsk_backoff)
416 break; 426 break;
417 427
428 if (sock_owned_by_user(sk))
429 break;
430
418 icsk->icsk_backoff--; 431 icsk->icsk_backoff--;
419 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << 432 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
420 icsk->icsk_backoff; 433 icsk->icsk_backoff;
@@ -429,11 +442,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
429 if (remaining) { 442 if (remaining) {
430 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 443 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
431 remaining, TCP_RTO_MAX); 444 remaining, TCP_RTO_MAX);
432 } else if (sock_owned_by_user(sk)) {
433 /* RTO revert clocked out retransmission,
434 * but socket is locked. Will defer. */
435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 HZ/20, TCP_RTO_MAX);
437 } else { 445 } else {
438 /* RTO revert clocked out retransmission. 446 /* RTO revert clocked out retransmission.
439 * Will retransmit now */ 447 * Will retransmit now */
@@ -643,7 +651,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
643 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 651 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
644 652
645 net = dev_net(skb_dst(skb)->dev); 653 net = dev_net(skb_dst(skb)->dev);
646 ip_send_reply(net->ipv4.tcp_sock, skb, 654 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
647 &arg, arg.iov[0].iov_len); 655 &arg, arg.iov[0].iov_len);
648 656
649 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 657 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -718,7 +726,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
718 if (oif) 726 if (oif)
719 arg.bound_dev_if = oif; 727 arg.bound_dev_if = oif;
720 728
721 ip_send_reply(net->ipv4.tcp_sock, skb, 729 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
722 &arg, arg.iov[0].iov_len); 730 &arg, arg.iov[0].iov_len);
723 731
724 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 732 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -761,11 +769,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
761 struct request_values *rvp) 769 struct request_values *rvp)
762{ 770{
763 const struct inet_request_sock *ireq = inet_rsk(req); 771 const struct inet_request_sock *ireq = inet_rsk(req);
772 struct flowi4 fl4;
764 int err = -1; 773 int err = -1;
765 struct sk_buff * skb; 774 struct sk_buff * skb;
766 775
767 /* First, grab a route. */ 776 /* First, grab a route. */
768 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) 777 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
769 return -1; 778 return -1;
770 779
771 skb = tcp_make_synack(sk, dst, req, rvp); 780 skb = tcp_make_synack(sk, dst, req, rvp);
@@ -816,17 +825,18 @@ static void syn_flood_warning(const struct sk_buff *skb)
816/* 825/*
817 * Save and compile IPv4 options into the request_sock if needed. 826 * Save and compile IPv4 options into the request_sock if needed.
818 */ 827 */
819static struct ip_options *tcp_v4_save_options(struct sock *sk, 828static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
820 struct sk_buff *skb) 829 struct sk_buff *skb)
821{ 830{
822 struct ip_options *opt = &(IPCB(skb)->opt); 831 const struct ip_options *opt = &(IPCB(skb)->opt);
823 struct ip_options *dopt = NULL; 832 struct ip_options_rcu *dopt = NULL;
824 833
825 if (opt && opt->optlen) { 834 if (opt && opt->optlen) {
826 int opt_size = optlength(opt); 835 int opt_size = sizeof(*dopt) + opt->optlen;
836
827 dopt = kmalloc(opt_size, GFP_ATOMIC); 837 dopt = kmalloc(opt_size, GFP_ATOMIC);
828 if (dopt) { 838 if (dopt) {
829 if (ip_options_echo(dopt, skb)) { 839 if (ip_options_echo(&dopt->opt, skb)) {
830 kfree(dopt); 840 kfree(dopt);
831 dopt = NULL; 841 dopt = NULL;
832 } 842 }
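Saved IP options now travel as a struct ip_options_rcu, i.e. the classic options block prefixed with an RCU head so fast-path readers can use them without holding the socket lock; that is why the allocation reserves sizeof(*dopt) + opt->optlen instead of the old optlength() of a bare struct ip_options. The precise layout is not visible in this hunk, but the assumed shape is roughly:

/* Assumed shape of the wrapper used above (the real definition lives
 * in include/net/inet_sock.h). */
struct ip_options_rcu_sketch {
        struct rcu_head   rcu;  /* deferred free once readers drain */
        struct ip_options opt;  /* opt.optlen bytes of option data follow */
};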
@@ -1212,12 +1222,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1212}; 1222};
1213#endif 1223#endif
1214 1224
1215static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1217 .twsk_unique = tcp_twsk_unique,
1218 .twsk_destructor= tcp_twsk_destructor,
1219};
1220
1221int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1225int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222{ 1226{
1223 struct tcp_extend_values tmp_ext; 1227 struct tcp_extend_values tmp_ext;
@@ -1335,6 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1335 req->cookie_ts = tmp_opt.tstamp_ok; 1339 req->cookie_ts = tmp_opt.tstamp_ok;
1336 } else if (!isn) { 1340 } else if (!isn) {
1337 struct inet_peer *peer = NULL; 1341 struct inet_peer *peer = NULL;
1342 struct flowi4 fl4;
1338 1343
1339 /* VJ's idea. We save last timestamp seen 1344 /* VJ's idea. We save last timestamp seen
1340 * from the destination in peer table, when entering 1345 * from the destination in peer table, when entering
@@ -1347,9 +1352,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1347 */ 1352 */
1348 if (tmp_opt.saw_tstamp && 1353 if (tmp_opt.saw_tstamp &&
1349 tcp_death_row.sysctl_tw_recycle && 1354 tcp_death_row.sysctl_tw_recycle &&
1350 (dst = inet_csk_route_req(sk, req)) != NULL && 1355 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1351 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1356 fl4.daddr == saddr &&
1352 peer->v4daddr == saddr) { 1357 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1353 inet_peer_refcheck(peer); 1358 inet_peer_refcheck(peer);
1354 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1359 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355 (s32)(peer->tcp_ts - req->ts_recent) > 1360 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1413,19 +1418,16 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1413#ifdef CONFIG_TCP_MD5SIG 1418#ifdef CONFIG_TCP_MD5SIG
1414 struct tcp_md5sig_key *key; 1419 struct tcp_md5sig_key *key;
1415#endif 1420#endif
1421 struct ip_options_rcu *inet_opt;
1416 1422
1417 if (sk_acceptq_is_full(sk)) 1423 if (sk_acceptq_is_full(sk))
1418 goto exit_overflow; 1424 goto exit_overflow;
1419 1425
1420 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1421 goto exit;
1422
1423 newsk = tcp_create_openreq_child(sk, req, skb); 1426 newsk = tcp_create_openreq_child(sk, req, skb);
1424 if (!newsk) 1427 if (!newsk)
1425 goto exit; 1428 goto exit_nonewsk;
1426 1429
1427 newsk->sk_gso_type = SKB_GSO_TCPV4; 1430 newsk->sk_gso_type = SKB_GSO_TCPV4;
1428 sk_setup_caps(newsk, dst);
1429 1431
1430 newtp = tcp_sk(newsk); 1432 newtp = tcp_sk(newsk);
1431 newinet = inet_sk(newsk); 1433 newinet = inet_sk(newsk);
@@ -1433,18 +1435,24 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1433 newinet->inet_daddr = ireq->rmt_addr; 1435 newinet->inet_daddr = ireq->rmt_addr;
1434 newinet->inet_rcv_saddr = ireq->loc_addr; 1436 newinet->inet_rcv_saddr = ireq->loc_addr;
1435 newinet->inet_saddr = ireq->loc_addr; 1437 newinet->inet_saddr = ireq->loc_addr;
1436 newinet->opt = ireq->opt; 1438 inet_opt = ireq->opt;
1439 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1437 ireq->opt = NULL; 1440 ireq->opt = NULL;
1438 newinet->mc_index = inet_iif(skb); 1441 newinet->mc_index = inet_iif(skb);
1439 newinet->mc_ttl = ip_hdr(skb)->ttl; 1442 newinet->mc_ttl = ip_hdr(skb)->ttl;
1440 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1443 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1441 if (newinet->opt) 1444 if (inet_opt)
1442 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1445 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1443 newinet->inet_id = newtp->write_seq ^ jiffies; 1446 newinet->inet_id = newtp->write_seq ^ jiffies;
1444 1447
1448 if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1449 goto put_and_exit;
1450
1451 sk_setup_caps(newsk, dst);
1452
1445 tcp_mtup_init(newsk); 1453 tcp_mtup_init(newsk);
1446 tcp_sync_mss(newsk, dst_mtu(dst)); 1454 tcp_sync_mss(newsk, dst_mtu(dst));
1447 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1455 newtp->advmss = dst_metric_advmss(dst);
1448 if (tcp_sk(sk)->rx_opt.user_mss && 1456 if (tcp_sk(sk)->rx_opt.user_mss &&
1449 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1457 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1450 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1458 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1469,17 +1477,22 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1469 } 1477 }
1470#endif 1478#endif
1471 1479
1480 if (__inet_inherit_port(sk, newsk) < 0)
1481 goto put_and_exit;
1472 __inet_hash_nolisten(newsk, NULL); 1482 __inet_hash_nolisten(newsk, NULL);
1473 __inet_inherit_port(sk, newsk);
1474 1483
1475 return newsk; 1484 return newsk;
1476 1485
1477exit_overflow: 1486exit_overflow:
1478 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1487 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488exit_nonewsk:
1489 dst_release(dst);
1479exit: 1490exit:
1480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1491 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481 dst_release(dst);
1482 return NULL; 1492 return NULL;
1493put_and_exit:
1494 sock_put(newsk);
1495 goto exit;
1483} 1496}
1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1497EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1485 1498
@@ -1560,12 +1573,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1560 1573
1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1574 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 sock_rps_save_rxhash(sk, skb->rxhash); 1575 sock_rps_save_rxhash(sk, skb->rxhash);
1563 TCP_CHECK_TIMER(sk);
1564 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1576 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1565 rsk = sk; 1577 rsk = sk;
1566 goto reset; 1578 goto reset;
1567 } 1579 }
1568 TCP_CHECK_TIMER(sk);
1569 return 0; 1580 return 0;
1570 } 1581 }
1571 1582
@@ -1578,6 +1589,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1578 goto discard; 1589 goto discard;
1579 1590
1580 if (nsk != sk) { 1591 if (nsk != sk) {
1592 sock_rps_save_rxhash(nsk, skb->rxhash);
1581 if (tcp_child_process(sk, nsk, skb)) { 1593 if (tcp_child_process(sk, nsk, skb)) {
1582 rsk = nsk; 1594 rsk = nsk;
1583 goto reset; 1595 goto reset;
@@ -1587,13 +1599,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1587 } else 1599 } else
1588 sock_rps_save_rxhash(sk, skb->rxhash); 1600 sock_rps_save_rxhash(sk, skb->rxhash);
1589 1601
1590
1591 TCP_CHECK_TIMER(sk);
1592 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1602 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1593 rsk = sk; 1603 rsk = sk;
1594 goto reset; 1604 goto reset;
1595 } 1605 }
1596 TCP_CHECK_TIMER(sk);
1597 return 0; 1606 return 0;
1598 1607
1599reset: 1608reset:
@@ -1761,64 +1770,41 @@ do_time_wait:
1761 goto discard_it; 1770 goto discard_it;
1762} 1771}
1763 1772
1764/* VJ's idea. Save last timestamp seen from this destination 1773struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1765 * and hold it at least for normal timewait interval to use for duplicate
1766 * segment detection in subsequent connections, before they enter synchronized
1767 * state.
1768 */
1769
1770int tcp_v4_remember_stamp(struct sock *sk)
1771{ 1774{
1775 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1772 struct inet_sock *inet = inet_sk(sk); 1776 struct inet_sock *inet = inet_sk(sk);
1773 struct tcp_sock *tp = tcp_sk(sk); 1777 struct inet_peer *peer;
1774 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775 struct inet_peer *peer = NULL;
1776 int release_it = 0;
1777 1778
1778 if (!rt || rt->rt_dst != inet->inet_daddr) { 1779 if (!rt ||
1779 peer = inet_getpeer(inet->inet_daddr, 1); 1780 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1780 release_it = 1; 1781 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1782 *release_it = true;
1781 } else { 1783 } else {
1782 if (!rt->peer) 1784 if (!rt->peer)
1783 rt_bind_peer(rt, 1); 1785 rt_bind_peer(rt, inet->inet_daddr, 1);
1784 peer = rt->peer; 1786 peer = rt->peer;
1787 *release_it = false;
1785 } 1788 }
1786 1789
1787 if (peer) { 1790 return peer;
1788 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1789 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1792 peer->tcp_ts = tp->rx_opt.ts_recent;
1793 }
1794 if (release_it)
1795 inet_putpeer(peer);
1796 return 1;
1797 }
1798
1799 return 0;
1800} 1791}
1801EXPORT_SYMBOL(tcp_v4_remember_stamp); 1792EXPORT_SYMBOL(tcp_v4_get_peer);
1802 1793
1803int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1794void *tcp_v4_tw_get_peer(struct sock *sk)
1804{ 1795{
1805 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); 1796 struct inet_timewait_sock *tw = inet_twsk(sk);
1806
1807 if (peer) {
1808 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1809
1810 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1811 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1812 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1813 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1814 peer->tcp_ts = tcptw->tw_ts_recent;
1815 }
1816 inet_putpeer(peer);
1817 return 1;
1818 }
1819 1797
1820 return 0; 1798 return inet_getpeer_v4(tw->tw_daddr, 1);
1821} 1799}
1800EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1801
1802static struct timewait_sock_ops tcp_timewait_sock_ops = {
1803 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1804 .twsk_unique = tcp_twsk_unique,
1805 .twsk_destructor= tcp_twsk_destructor,
1806 .twsk_getpeer = tcp_v4_tw_get_peer,
1807};
1822 1808
1823const struct inet_connection_sock_af_ops ipv4_specific = { 1809const struct inet_connection_sock_af_ops ipv4_specific = {
1824 .queue_xmit = ip_queue_xmit, 1810 .queue_xmit = ip_queue_xmit,
@@ -1826,7 +1812,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1826 .rebuild_header = inet_sk_rebuild_header, 1812 .rebuild_header = inet_sk_rebuild_header,
1827 .conn_request = tcp_v4_conn_request, 1813 .conn_request = tcp_v4_conn_request,
1828 .syn_recv_sock = tcp_v4_syn_recv_sock, 1814 .syn_recv_sock = tcp_v4_syn_recv_sock,
1829 .remember_stamp = tcp_v4_remember_stamp, 1815 .get_peer = tcp_v4_get_peer,
1830 .net_header_len = sizeof(struct iphdr), 1816 .net_header_len = sizeof(struct iphdr),
1831 .setsockopt = ip_setsockopt, 1817 .setsockopt = ip_setsockopt,
1832 .getsockopt = ip_getsockopt, 1818 .getsockopt = ip_getsockopt,
@@ -2022,13 +2008,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2022 } 2008 }
2023 req = req->dl_next; 2009 req = req->dl_next;
2024 } 2010 }
2025 st->offset = 0;
2026 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 2011 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2027 break; 2012 break;
2028get_req: 2013get_req:
2029 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 2014 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2030 } 2015 }
2031 sk = sk_next(st->syn_wait_sk); 2016 sk = sk_nulls_next(st->syn_wait_sk);
2032 st->state = TCP_SEQ_STATE_LISTENING; 2017 st->state = TCP_SEQ_STATE_LISTENING;
2033 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2018 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034 } else { 2019 } else {
@@ -2037,11 +2022,13 @@ get_req:
2037 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 2022 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2038 goto start_req; 2023 goto start_req;
2039 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2024 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2040 sk = sk_next(sk); 2025 sk = sk_nulls_next(sk);
2041 } 2026 }
2042get_sk: 2027get_sk:
2043 sk_nulls_for_each_from(sk, node) { 2028 sk_nulls_for_each_from(sk, node) {
2044 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 2029 if (!net_eq(sock_net(sk), net))
2030 continue;
2031 if (sk->sk_family == st->family) {
2045 cur = sk; 2032 cur = sk;
2046 goto out; 2033 goto out;
2047 } 2034 }
@@ -2385,7 +2372,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2385 int ttd = req->expires - jiffies; 2372 int ttd = req->expires - jiffies;
2386 2373
2387 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2374 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2375 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2389 i, 2376 i,
2390 ireq->loc_addr, 2377 ireq->loc_addr,
2391 ntohs(inet_sk(sk)->inet_sport), 2378 ntohs(inet_sk(sk)->inet_sport),
@@ -2440,7 +2427,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2440 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2427 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441 2428
2442 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2429 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2430 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2444 i, src, srcp, dest, destp, sk->sk_state, 2431 i, src, srcp, dest, destp, sk->sk_state,
2445 tp->write_seq - tp->snd_una, 2432 tp->write_seq - tp->snd_una,
2446 rx_queue, 2433 rx_queue,
@@ -2475,7 +2462,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,
2475 srcp = ntohs(tw->tw_sport); 2462 srcp = ntohs(tw->tw_sport);
2476 2463
2477 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2464 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", 2465 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2479 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2466 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2467 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481 atomic_read(&tw->tw_refcnt), tw, len); 2468 atomic_read(&tw->tw_refcnt), tw, len);
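The %p to %pK conversions in the /proc/net/tcp formatters let socket addresses be hidden from unprivileged readers: %pK honours the kptr_restrict sysctl and prints zeros when the reader is not allowed to see kernel pointers. A tiny usage sketch:

#include <linux/seq_file.h>

/* Emit the socket address with %pK so it can be censored via
 * kptr_restrict for unprivileged readers of the seq file. */
static void show_sock_sketch(struct seq_file *f, const void *sk)
{
        seq_printf(f, "sock at %pK\n", sk);
}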
@@ -2553,7 +2540,7 @@ void tcp4_proc_exit(void)
2553 2540
2554struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2541struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555{ 2542{
2556 struct iphdr *iph = skb_gro_network_header(skb); 2543 const struct iphdr *iph = skb_gro_network_header(skb);
2557 2544
2558 switch (skb->ip_summed) { 2545 switch (skb->ip_summed) {
2559 case CHECKSUM_COMPLETE: 2546 case CHECKSUM_COMPLETE:
@@ -2571,11 +2558,10 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2571 2558
2572 return tcp_gro_receive(head, skb); 2559 return tcp_gro_receive(head, skb);
2573} 2560}
2574EXPORT_SYMBOL(tcp4_gro_receive);
2575 2561
2576int tcp4_gro_complete(struct sk_buff *skb) 2562int tcp4_gro_complete(struct sk_buff *skb)
2577{ 2563{
2578 struct iphdr *iph = ip_hdr(skb); 2564 const struct iphdr *iph = ip_hdr(skb);
2579 struct tcphdr *th = tcp_hdr(skb); 2565 struct tcphdr *th = tcp_hdr(skb);
2580 2566
2581 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), 2567 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
@@ -2584,7 +2570,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2584 2570
2585 return tcp_gro_complete(skb); 2571 return tcp_gro_complete(skb);
2586} 2572}
2587EXPORT_SYMBOL(tcp4_gro_complete);
2588 2573
2589struct proto tcp_prot = { 2574struct proto tcp_prot = {
2590 .name = "TCP", 2575 .name = "TCP",
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbba..72f7218b03f5 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -12,7 +12,7 @@
12 * within cong_avoid. 12 * within cong_avoid.
13 * o Error correcting in remote HZ, therefore remote HZ will be keeped 13 * o Error correcting in remote HZ, therefore remote HZ will be keeped
14 * on checking and updating. 14 * on checking and updating.
15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne 15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
16 * OWD have a similar meaning as RTT. Also correct the buggy formular. 16 * OWD have a similar meaning as RTT. Also correct the buggy formular.
17 * o Handle reaction for Early Congestion Indication (ECI) within 17 * o Handle reaction for Early Congestion Indication (ECI) within
18 * pkts_acked, as mentioned within pseudo code. 18 * pkts_acked, as mentioned within pseudo code.
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
313 lp->last_drop = tcp_time_stamp; 313 lp->last_drop = tcp_time_stamp;
314} 314}
315 315
316static struct tcp_congestion_ops tcp_lp = { 316static struct tcp_congestion_ops tcp_lp __read_mostly = {
317 .flags = TCP_CONG_RTT_STAMP, 317 .flags = TCP_CONG_RTT_STAMP,
318 .init = tcp_lp_init, 318 .init = tcp_lp_init,
319 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..80b1f80759ab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,13 +49,63 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
52static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
55 return 1; 105 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
57 return 1; 107 return 1;
58 return (seq == e_win && seq == end_seq); 108 return seq == e_win && seq == end_seq;
59} 109}
60 110
61/* 111/*
@@ -149,14 +199,9 @@ kill_with_rst:
149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
150 } 200 }
151 201
152 /* I am shamed, but failed to make it more elegant. 202 if (tcp_death_row.sysctl_tw_recycle &&
153 * Yes, it is direct reference to IP, which is impossible 203 tcptw->tw_ts_recent_stamp &&
154 * to generalize to IPv6. Taking into account that IPv6 204 tcp_tw_remember_stamp(tw))
155 * do not understand recycling in any case, it not
156 * a big problem in practice. --ANK */
157 if (tw->tw_family == AF_INET &&
158 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
159 tcp_v4_tw_remember_stamp(tw))
160 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 205 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
161 TCP_TIMEWAIT_LEN); 206 TCP_TIMEWAIT_LEN);
162 else 207 else
@@ -274,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
274 int recycle_ok = 0; 319 int recycle_ok = 0;
275 320
276 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
277 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
278 323
279 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 324 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
280 tw = inet_twsk_alloc(sk, state); 325 tw = inet_twsk_alloc(sk, state);
@@ -347,7 +392,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
347 * socket up. We've got bigger problems than 392 * socket up. We've got bigger problems than
348 * non-graceful socket closings. 393 * non-graceful socket closings.
349 */ 394 */
350 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); 395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
351 } 396 }
352 397
353 tcp_update_metrics(sk); 398 tcp_update_metrics(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..882e0b0964d0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
55int sysctl_tcp_tso_win_divisor __read_mostly = 3; 55int sysctl_tcp_tso_win_divisor __read_mostly = 3;
56 56
57int sysctl_tcp_mtu_probing __read_mostly = 0; 57int sysctl_tcp_mtu_probing __read_mostly = 0;
58int sysctl_tcp_base_mss __read_mostly = 512; 58int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
59 59
60/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
61int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
@@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
73 tcp_advance_send_head(sk, skb); 73 tcp_advance_send_head(sk, skb);
74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
75 75
76 /* Don't override Nagle indefinately with F-RTO */ 76 /* Don't override Nagle indefinitely with F-RTO */
77 if (tp->frto_counter == 2) 77 if (tp->frto_counter == 2)
78 tp->frto_counter = 3; 78 tp->frto_counter = 3;
79 79
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
119 struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
120 int mss = tp->advmss; 120 int mss = tp->advmss;
121 121
122 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { 122 if (dst) {
123 mss = dst_metric(dst, RTAX_ADVMSS); 123 unsigned int metric = dst_metric_advmss(dst);
124 tp->advmss = mss; 124
125 if (metric < mss) {
126 mss = metric;
127 tp->advmss = mss;
128 }
125 } 129 }
126 130
127 return (__u16)mss; 131 return (__u16)mss;
@@ -224,24 +228,22 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 228 }
225 } 229 }
226 230
227 /* Set initial window to value enough for senders, 231 /* Set initial window to a value enough for senders starting with
228 * following RFC2414. Senders, not following this RFC, 232 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
229 * will be satisfied with 2. 233 * a limit on the initial window when mss is larger than 1460.
230 */ 234 */
231 if (mss > (1 << *rcv_wscale)) { 235 if (mss > (1 << *rcv_wscale)) {
232 int init_cwnd = 4; 236 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
233 if (mss > 1460 * 3) 237 if (mss > 1460)
234 init_cwnd = 2; 238 init_cwnd =
235 else if (mss > 1460) 239 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
236 init_cwnd = 3;
237 /* when initializing use the value from init_rcv_wnd 240 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above 241 * rather than the default from above
239 */ 242 */
240 if (init_rcv_wnd && 243 if (init_rcv_wnd)
241 (*rcv_wnd > init_rcv_wnd * mss)) 244 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
242 *rcv_wnd = init_rcv_wnd * mss; 245 else
243 else if (*rcv_wnd > init_cwnd * mss) 246 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
244 *rcv_wnd = init_cwnd * mss;
245 } 247 }
246 248
247 /* Set the clamp no higher than max representable value */ 249 /* Set the clamp no higher than max representable value */
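The receive side now starts from TCP_DEFAULT_INIT_RCVWND segments (to match senders that use a larger initial congestion window) and scales that down when the MSS exceeds 1460 bytes so the advertised window stays bounded in bytes, with a floor of two segments. A plain-arithmetic sketch, assuming TCP_DEFAULT_INIT_RCVWND is 10:

/* Initial receive window, in segments, for a given MSS. */
static unsigned int init_rcvwnd_segs_sketch(unsigned int mss)
{
        unsigned int init_cwnd = 10;    /* assumed TCP_DEFAULT_INIT_RCVWND */

        if (mss > 1460) {
                init_cwnd = (1460 * 10) / mss;
                if (init_cwnd < 2)
                        init_cwnd = 2;
        }
        return init_cwnd;
}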
@@ -392,27 +394,30 @@ struct tcp_out_options {
392 */ 394 */
393static u8 tcp_cookie_size_check(u8 desired) 395static u8 tcp_cookie_size_check(u8 desired)
394{ 396{
395 if (desired > 0) { 397 int cookie_size;
398
399 if (desired > 0)
396 /* previously specified */ 400 /* previously specified */
397 return desired; 401 return desired;
398 } 402
399 if (sysctl_tcp_cookie_size <= 0) { 403 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
404 if (cookie_size <= 0)
400 /* no default specified */ 405 /* no default specified */
401 return 0; 406 return 0;
402 } 407
403 if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { 408 if (cookie_size <= TCP_COOKIE_MIN)
404 /* value too small, specify minimum */ 409 /* value too small, specify minimum */
405 return TCP_COOKIE_MIN; 410 return TCP_COOKIE_MIN;
406 } 411
407 if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { 412 if (cookie_size >= TCP_COOKIE_MAX)
408 /* value too large, specify maximum */ 413 /* value too large, specify maximum */
409 return TCP_COOKIE_MAX; 414 return TCP_COOKIE_MAX;
410 } 415
411 if (0x1 & sysctl_tcp_cookie_size) { 416 if (cookie_size & 1)
412 /* 8-bit multiple, illegal, fix it */ 417 /* 8-bit multiple, illegal, fix it */
413 return (u8)(sysctl_tcp_cookie_size + 0x1); 418 cookie_size++;
414 } 419
415 return (u8)sysctl_tcp_cookie_size; 420 return (u8)cookie_size;
416} 421}
417 422
418/* Write previously computed TCP options to the packet. 423/* Write previously computed TCP options to the packet.
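tcp_cookie_size_check() now snapshots sysctl_tcp_cookie_size once with ACCESS_ONCE() and performs every range check on the local copy; the sysctl can be rewritten from /proc at any moment, and re-reading it between checks could let an out-of-range value slip through (the same idiom is applied to sysctl_tcp_tso_win_divisor further down). A minimal sketch of the pattern, with an illustrative knob and bounds:

static int example_sysctl_knob;  /* may be updated concurrently via /proc */

/* Read the knob exactly once, then clamp the stable local copy. */
static int clamp_knob_sketch(void)
{
        int v = ACCESS_ONCE(example_sysctl_knob);

        if (v < 8)
                return 8;
        if (v > 128)
                return 128;
        return v;
}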
@@ -828,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
828 &md5); 833 &md5);
829 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 834 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
830 835
831 if (tcp_packets_in_flight(tp) == 0) 836 if (tcp_packets_in_flight(tp) == 0) {
832 tcp_ca_event(sk, CA_EVENT_TX_START); 837 tcp_ca_event(sk, CA_EVENT_TX_START);
838 skb->ooo_okay = 1;
839 } else
840 skb->ooo_okay = 0;
833 841
834 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
835 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
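The new ooo_okay bit is set only while nothing is in flight; the likely intent (not spelled out in this hunk) is that this is exactly when the transmit path may pick a different hardware queue for the flow without risking out-of-order delivery, and the bit is cleared again once packets are outstanding. A trivial sketch of the condition being recorded:

/* May this flow hop to another tx queue right now? Only when the
 * pipe is empty -- that is what ooo_okay records on the skb. */
static int may_change_tx_queue_sketch(unsigned int packets_in_flight)
{
        return packets_in_flight == 0;
}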
@@ -891,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
891 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 899 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
892 tcp_skb_pcount(skb)); 900 tcp_skb_pcount(skb));
893 901
894 err = icsk->icsk_af_ops->queue_xmit(skb); 902 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
895 if (likely(err <= 0)) 903 if (likely(err <= 0))
896 return err; 904 return err;
897 905
@@ -995,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
995 int nlen; 1003 int nlen;
996 u8 flags; 1004 u8 flags;
997 1005
998 BUG_ON(len > skb->len); 1006 if (WARN_ON(len > skb->len))
1007 return -EINVAL;
999 1008
1000 nsize = skb_headlen(skb) - len; 1009 nsize = skb_headlen(skb) - len;
1001 if (nsize < 0) 1010 if (nsize < 0)
@@ -1342,7 +1351,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1342 return 0; 1351 return 0;
1343} 1352}
1344 1353
1345/* Intialize TSO state of a skb. 1354/* Initialize TSO state of a skb.
1346 * This must be invoked the first time we consider transmitting 1355 * This must be invoked the first time we consider transmitting
1347 * SKB onto the wire. 1356 * SKB onto the wire.
1348 */ 1357 */
@@ -1376,9 +1385,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1376 const struct sk_buff *skb, 1385 const struct sk_buff *skb,
1377 unsigned mss_now, int nonagle) 1386 unsigned mss_now, int nonagle)
1378{ 1387{
1379 return (skb->len < mss_now && 1388 return skb->len < mss_now &&
1380 ((nonagle & TCP_NAGLE_CORK) || 1389 ((nonagle & TCP_NAGLE_CORK) ||
1381 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1390 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1382} 1391}
1383 1392
1384/* Return non-zero if the Nagle test allows this packet to be 1393/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1458,10 @@ int tcp_may_send_now(struct sock *sk)
1449 struct tcp_sock *tp = tcp_sk(sk); 1458 struct tcp_sock *tp = tcp_sk(sk);
1450 struct sk_buff *skb = tcp_send_head(sk); 1459 struct sk_buff *skb = tcp_send_head(sk);
1451 1460
1452 return (skb && 1461 return skb &&
1453 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1462 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1454 (tcp_skb_is_last(sk, skb) ? 1463 (tcp_skb_is_last(sk, skb) ?
1455 tp->nonagle : TCP_NAGLE_PUSH))); 1464 tp->nonagle : TCP_NAGLE_PUSH));
1456} 1465}
1457 1466
1458/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1467/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1519,6 +1528,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1519 struct tcp_sock *tp = tcp_sk(sk); 1528 struct tcp_sock *tp = tcp_sk(sk);
1520 const struct inet_connection_sock *icsk = inet_csk(sk); 1529 const struct inet_connection_sock *icsk = inet_csk(sk);
1521 u32 send_win, cong_win, limit, in_flight; 1530 u32 send_win, cong_win, limit, in_flight;
1531 int win_divisor;
1522 1532
1523 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) 1533 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
1524 goto send_now; 1534 goto send_now;
@@ -1550,13 +1560,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1550 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1560 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1551 goto send_now; 1561 goto send_now;
1552 1562
1553 if (sysctl_tcp_tso_win_divisor) { 1563 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1564 if (win_divisor) {
1554 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1565 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1555 1566
1556 /* If at least some fraction of a window is available, 1567 /* If at least some fraction of a window is available,
1557 * just use it. 1568 * just use it.
1558 */ 1569 */
1559 chunk /= sysctl_tcp_tso_win_divisor; 1570 chunk /= win_divisor;
1560 if (limit >= chunk) 1571 if (limit >= chunk)
1561 goto send_now; 1572 goto send_now;
1562 } else { 1573 } else {
@@ -2152,7 +2163,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2152 if (!tp->retrans_stamp) 2163 if (!tp->retrans_stamp)
2153 tp->retrans_stamp = TCP_SKB_CB(skb)->when; 2164 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
2154 2165
2155 tp->undo_retrans++; 2166 tp->undo_retrans += tcp_skb_pcount(skb);
2156 2167
2157 /* snd_nxt is stored to detect loss of retransmitted segment, 2168 /* snd_nxt is stored to detect loss of retransmitted segment,
2158 * see tcp_input.c tcp_sacktag_write_queue(). 2169 * see tcp_input.c tcp_sacktag_write_queue().
@@ -2421,7 +2432,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2421 2432
2422 skb_dst_set(skb, dst_clone(dst)); 2433 skb_dst_set(skb, dst_clone(dst));
2423 2434
2424 mss = dst_metric(dst, RTAX_ADVMSS); 2435 mss = dst_metric_advmss(dst);
2425 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2436 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2426 mss = tp->rx_opt.user_mss; 2437 mss = tp->rx_opt.user_mss;
2427 2438
@@ -2429,6 +2440,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2429 __u8 rcv_wscale; 2440 __u8 rcv_wscale;
2430 /* Set this up on the first call only */ 2441 /* Set this up on the first call only */
2431 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2442 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2443
2444 /* limit the window selection if the user enforce a smaller rx buffer */
2445 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2446 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2447 req->window_clamp = tcp_full_space(sk);
2448
2432 /* tcp_full_space because it is guaranteed to be the first packet */ 2449 /* tcp_full_space because it is guaranteed to be the first packet */
2433 tcp_select_initial_window(tcp_full_space(sk), 2450 tcp_select_initial_window(tcp_full_space(sk),
2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2451 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2549,12 +2566,17 @@ static void tcp_connect_init(struct sock *sk)
2549 2566
2550 if (!tp->window_clamp) 2567 if (!tp->window_clamp)
2551 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2568 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2552 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2569 tp->advmss = dst_metric_advmss(dst);
2553 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2570 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2554 tp->advmss = tp->rx_opt.user_mss; 2571 tp->advmss = tp->rx_opt.user_mss;
2555 2572
2556 tcp_initialize_rcv_mss(sk); 2573 tcp_initialize_rcv_mss(sk);
2557 2574
2575 /* limit the window selection if the user enforce a smaller rx buffer */
2576 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2577 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2578 tp->window_clamp = tcp_full_space(sk);
2579
2558 tcp_select_initial_window(tcp_full_space(sk), 2580 tcp_select_initial_window(tcp_full_space(sk),
2559 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2581 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2560 &tp->rcv_wnd, 2582 &tp->rcv_wnd,
@@ -2587,6 +2609,7 @@ int tcp_connect(struct sock *sk)
2587{ 2609{
2588 struct tcp_sock *tp = tcp_sk(sk); 2610 struct tcp_sock *tp = tcp_sk(sk);
2589 struct sk_buff *buff; 2611 struct sk_buff *buff;
2612 int err;
2590 2613
2591 tcp_connect_init(sk); 2614 tcp_connect_init(sk);
2592 2615
@@ -2609,7 +2632,9 @@ int tcp_connect(struct sock *sk)
2609 sk->sk_wmem_queued += buff->truesize; 2632 sk->sk_wmem_queued += buff->truesize;
2610 sk_mem_charge(sk, buff->truesize); 2633 sk_mem_charge(sk, buff->truesize);
2611 tp->packets_out += tcp_skb_pcount(buff); 2634 tp->packets_out += tcp_skb_pcount(buff);
2612 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2635 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2636 if (err == -ECONNREFUSED)
2637 return err;
2613 2638
2614 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2639 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2615 * in order to make this packet get counted in tcpOutSegs. 2640 * in order to make this packet get counted in tcpOutSegs.
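
The tcp_output.c hunks above do three things: fetch the advertised MSS through dst_metric_advmss(), clamp the offered receive window to tcp_full_space(sk) when userspace has locked the receive buffer, and let tcp_connect() propagate -ECONNREFUSED from transmitting the SYN. A minimal userspace sketch of the condition the window clamp reacts to; the address, port and buffer size are illustrative, and setting SO_RCVBUF is what marks the socket with SOCK_RCVBUF_LOCK:

/* Sketch only: pin the receive buffer before connect().  setsockopt(SO_RCVBUF)
 * sets sk->sk_userlocks |= SOCK_RCVBUF_LOCK, which is what the new
 * window_clamp checks in tcp_connect_init() and tcp_make_synack() test for.
 * With the tcp_connect() change, a locally detected -ECONNREFUSED while
 * sending the SYN is reported instead of being ignored.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int rcvbuf = 16 * 1024;		/* deliberately small, illustrative */
	struct sockaddr_in dst;

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
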
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..85ee7eb7e38e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n)
154 struct timespec tv 154 struct timespec tv
155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
156 156
157 return snprintf(tbuf, n, 157 return scnprintf(tbuf, n,
158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
159 (unsigned long) tv.tv_sec, 159 (unsigned long) tv.tv_sec,
160 (unsigned long) tv.tv_nsec, 160 (unsigned long) tv.tv_nsec,
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
174 return -EINVAL; 174 return -EINVAL;
175 175
176 while (cnt < len) { 176 while (cnt < len) {
177 char tbuf[128]; 177 char tbuf[164];
178 int width; 178 int width;
179 179
180 /* Wait for data in buffer */ 180 /* Wait for data in buffer */
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
214 .owner = THIS_MODULE, 214 .owner = THIS_MODULE,
215 .open = tcpprobe_open, 215 .open = tcpprobe_open,
216 .read = tcpprobe_read, 216 .read = tcpprobe_read,
217 .llseek = noop_llseek,
217}; 218};
218 219
219static __init int tcpprobe_init(void) 220static __init int tcpprobe_init(void)
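
The tcp_probe.c changes are defensive: tcpprobe_read() uses the return value of tcpprobe_sprint() as a copy width, and snprintf() reports the length the output would have had even when it is truncated, while the kernel's scnprintf() reports what was actually stored; the line buffer also grows to 164 bytes and noop_llseek is wired up. A small userspace model of the difference, where my_scnprintf is an illustrative stand-in for the kernel helper:

/* Sketch only: snprintf() can return more than the buffer holds, scnprintf()
 * cannot, so using the return value as a copy width is only safe with the
 * latter.  my_scnprintf mirrors the clamping done in lib/vsprintf.c.
 */
#include <stdarg.h>
#include <stdio.h>

static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);
	if (i >= (int)size)		/* clamp to what actually fits */
		i = size ? (int)size - 1 : 0;
	return i;
}

int main(void)
{
	char tbuf[16];
	int want = snprintf(tbuf, sizeof(tbuf), "%s",
			    "a string longer than sixteen bytes");
	int got  = my_scnprintf(tbuf, sizeof(tbuf), "%s",
			    "a string longer than sixteen bytes");

	printf("snprintf reported %d, scnprintf-style reported %d, buffer holds %zu\n",
	       want, got, sizeof(tbuf));
	return 0;
}
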
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2b..8ce55b8aaec8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
35} 35}
36 36
37 37
38static struct tcp_congestion_ops tcp_scalable = { 38static struct tcp_congestion_ops tcp_scalable __read_mostly = {
39 .ssthresh = tcp_scalable_ssthresh, 39 .ssthresh = tcp_scalable_ssthresh,
40 .cong_avoid = tcp_scalable_cong_avoid, 40 .cong_avoid = tcp_scalable_cong_avoid,
41 .min_cwnd = tcp_reno_min_cwnd, 41 .min_cwnd = tcp_reno_min_cwnd,
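
tcp_scalable is one of several objects in this commit (the Vegas, Veno, Westwood and YeAH congestion ops, the tunnel4 and xfrm tunnel handler lists, udp_table) that gain the __read_mostly annotation: they are written once at registration time and read on every packet, so they are moved into the .data..read_mostly section to avoid sharing cache lines with frequently written data. A hedged userspace model of the annotation, assuming the usual <linux/cache.h> definition:

/* Sketch only: place a write-once, read-often structure in its own section.
 * The struct and callback are made up; only the attribute mirrors the kernel.
 */
#include <stdio.h>

#define __read_mostly __attribute__((__section__(".data..read_mostly")))

struct tcp_congestion_ops {
	unsigned int (*ssthresh)(void *sk);
	const char *name;
};

static unsigned int dummy_ssthresh(void *sk) { (void)sk; return 2; }

static struct tcp_congestion_ops tcp_example __read_mostly = {
	.ssthresh = dummy_ssthresh,
	.name = "example",
};

int main(void)
{
	printf("%s: ssthresh=%u\n", tcp_example.name, tcp_example.ssthresh(NULL));
	return 0;
}
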
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74c54b30600f..ecd44b0c45f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
140 */ 140 */
141static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary, 142 unsigned int boundary,
143 unsigned int timeout,
143 bool syn_set) 144 bool syn_set)
144{ 145{
145 unsigned int timeout, linear_backoff_thresh; 146 unsigned int linear_backoff_thresh, start_ts;
146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; 147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
148 148
149 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
@@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
154 else 154 else
155 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
156 156
157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 157 if (likely(timeout == 0)) {
158 158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
159 if (boundary <= linear_backoff_thresh)
160 timeout = ((2 << boundary) - 1) * rto_base;
161 else
162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
164 159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
165 return (tcp_time_stamp - start_ts) >= timeout; 166 return (tcp_time_stamp - start_ts) >= timeout;
166} 167}
167 168
@@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1; 180 syn_set = 1;
180 } else { 181 } else {
181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
182 /* Black hole detection */ 183 /* Black hole detection */
183 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
184 185
@@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
191 192
192 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
193 do_reset = alive || 194 do_reset = alive ||
194 !retransmits_timed_out(sk, retry_until, 0); 195 !retransmits_timed_out(sk, retry_until, 0, 0);
195 196
196 if (tcp_out_of_resources(sk, do_reset)) 197 if (tcp_out_of_resources(sk, do_reset))
197 return 1; 198 return 1;
198 } 199 }
199 } 200 }
200 201
201 if (retransmits_timed_out(sk, retry_until, syn_set)) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
202 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
203 tcp_write_err(sk); 205 tcp_write_err(sk);
204 return 1; 206 return 1;
@@ -257,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
257 tcp_send_ack(sk); 259 tcp_send_ack(sk);
258 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); 260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
259 } 261 }
260 TCP_CHECK_TIMER(sk);
261 262
262out: 263out:
263 if (tcp_memory_pressure) 264 if (tcp_memory_pressure)
@@ -365,18 +366,19 @@ void tcp_retransmit_timer(struct sock *sk)
365 if (icsk->icsk_retransmits == 0) { 366 if (icsk->icsk_retransmits == 0) {
366 int mib_idx; 367 int mib_idx;
367 368
368 if (icsk->icsk_ca_state == TCP_CA_Disorder) { 369 if (icsk->icsk_ca_state == TCP_CA_Recovery) {
369 if (tcp_is_sack(tp))
370 mib_idx = LINUX_MIB_TCPSACKFAILURES;
371 else
372 mib_idx = LINUX_MIB_TCPRENOFAILURES;
373 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
374 if (tcp_is_sack(tp)) 370 if (tcp_is_sack(tp))
375 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 371 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
376 else 372 else
377 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 373 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
378 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 374 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
379 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 375 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
376 } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
377 tp->sacked_out) {
378 if (tcp_is_sack(tp))
379 mib_idx = LINUX_MIB_TCPSACKFAILURES;
380 else
381 mib_idx = LINUX_MIB_TCPRENOFAILURES;
380 } else { 382 } else {
381 mib_idx = LINUX_MIB_TCPTIMEOUTS; 383 mib_idx = LINUX_MIB_TCPTIMEOUTS;
382 } 384 }
@@ -440,7 +442,7 @@ out_reset_timer:
440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 442 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
441 } 443 }
442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 444 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) 445 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
444 __sk_dst_reset(sk); 446 __sk_dst_reset(sk);
445 447
446out:; 448out:;
@@ -478,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
478 tcp_probe_timer(sk); 480 tcp_probe_timer(sk);
479 break; 481 break;
480 } 482 }
481 TCP_CHECK_TIMER(sk);
482 483
483out: 484out:
484 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
@@ -560,7 +561,14 @@ static void tcp_keepalive_timer (unsigned long data)
560 elapsed = keepalive_time_elapsed(tp); 561 elapsed = keepalive_time_elapsed(tp);
561 562
562 if (elapsed >= keepalive_time_when(tp)) { 563 if (elapsed >= keepalive_time_when(tp)) {
563 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 564 /* If the TCP_USER_TIMEOUT option is enabled, use that
565 * to determine when to timeout instead.
566 */
567 if ((icsk->icsk_user_timeout != 0 &&
568 elapsed >= icsk->icsk_user_timeout &&
569 icsk->icsk_probes_out > 0) ||
570 (icsk->icsk_user_timeout == 0 &&
571 icsk->icsk_probes_out >= keepalive_probes(tp))) {
564 tcp_send_active_reset(sk, GFP_ATOMIC); 572 tcp_send_active_reset(sk, GFP_ATOMIC);
565 tcp_write_err(sk); 573 tcp_write_err(sk);
566 goto out; 574 goto out;
@@ -579,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
579 elapsed = keepalive_time_when(tp) - elapsed; 587 elapsed = keepalive_time_when(tp) - elapsed;
580 } 588 }
581 589
582 TCP_CHECK_TIMER(sk);
583 sk_mem_reclaim(sk); 590 sk_mem_reclaim(sk);
584 591
585resched: 592resched:
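
Two behaviours interact in the tcp_timer.c hunks: retransmits_timed_out() now takes an explicit timeout so that TCP_USER_TIMEOUT (icsk_user_timeout) can override the retry-count bound at the final check, and the keepalive timer honours the same option when deciding to reset the connection. When no user timeout is given, the old conversion from a retransmit-count boundary to a wall-clock limit still applies; the standalone model below mirrors that computation (HZ, TCP_RTO_MIN and TCP_RTO_MAX are the usual kernel values, and with HZ set to 1000 the printed jiffies read as milliseconds):

/* Sketch only: model of the timeout == 0 branch of retransmits_timed_out(). */
#include <stdio.h>

#define HZ		1000
#define TCP_RTO_MIN	(HZ / 5)	/* 200 ms */
#define TCP_RTO_MAX	(120 * HZ)	/* 120 s  */

static unsigned int ilog2(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned int boundary_to_timeout(unsigned int boundary, unsigned int rto_base)
{
	unsigned int linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);

	if (boundary <= linear_backoff_thresh)
		return ((2 << boundary) - 1) * rto_base;
	return ((2 << linear_backoff_thresh) - 1) * rto_base +
	       (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
}

int main(void)
{
	unsigned int boundary;

	for (boundary = 1; boundary <= 15; boundary++)
		printf("boundary %2u -> timeout %6u ms\n",
		       boundary, boundary_to_timeout(boundary, TCP_RTO_MIN));
	return 0;
}

Applications opt in to the override with setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &msec, sizeof(msec)), which is what fills icsk_user_timeout.
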
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7d..80fa2bfd7ede 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
304} 304}
305EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 305EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
306 306
307static struct tcp_congestion_ops tcp_vegas = { 307static struct tcp_congestion_ops tcp_vegas __read_mostly = {
308 .flags = TCP_CONG_RTT_STAMP, 308 .flags = TCP_CONG_RTT_STAMP,
309 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
310 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf76183..ac43cd747bce 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." 6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication, 7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003. 8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 9 * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
201 return max(tp->snd_cwnd >> 1U, 2U); 201 return max(tp->snd_cwnd >> 1U, 2U);
202} 202}
203 203
204static struct tcp_congestion_ops tcp_veno = { 204static struct tcp_congestion_ops tcp_veno __read_mostly = {
205 .flags = TCP_CONG_RTT_STAMP, 205 .flags = TCP_CONG_RTT_STAMP,
206 .init = tcp_veno_init, 206 .init = tcp_veno_init,
207 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..1b91bf48e277 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
272} 272}
273 273
274 274
275static struct tcp_congestion_ops tcp_westwood = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
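
The tcp_westwood.c change to westwood_do_filter() only drops redundant parentheses; the helper itself is a 7/8 exponentially weighted moving average used to smooth bandwidth samples. A small standalone model of the filter, with made-up sample values:

/* Sketch only: the Westwood bandwidth filter, ((7 * a) + b) >> 3. */
#include <stdio.h>

static unsigned int westwood_do_filter(unsigned int a, unsigned int b)
{
	return ((7 * a) + b) >> 3;
}

int main(void)
{
	unsigned int est = 0;
	unsigned int samples[] = { 800, 900, 1000, 400, 1000, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		est = westwood_do_filter(est, samples[i]);
		printf("sample %4u -> estimate %4u\n", samples[i], est);
	}
	return 0;
}
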
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f240358892..05c3b6f0e8e1 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -20,7 +20,7 @@
20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss 20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion 21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
22#define TCP_YEAH_PHY 8 //lin maximum delta from base 22#define TCP_YEAH_PHY 8 //lin maximum delta from base
23#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss 23#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
24#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count 24#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count
25 25
26#define TCP_SCALABLE_AI_CNT 100U 26#define TCP_SCALABLE_AI_CNT 100U
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
225 return tp->snd_cwnd - reduction; 225 return tp->snd_cwnd - reduction;
226} 226}
227 227
228static struct tcp_congestion_ops tcp_yeah = { 228static struct tcp_congestion_ops tcp_yeah __read_mostly = {
229 .flags = TCP_CONG_RTT_STAMP, 229 .flags = TCP_CONG_RTT_STAMP,
230 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
231 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..ac3b3ee4b07c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,32 +14,37 @@
14#include <net/protocol.h> 14#include <net/protocol.h>
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16 16
17static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
18static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
19static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
20 20
21static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
22{ 22{
23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; 23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
24} 24}
25 25
26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) 26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
27{ 27{
28 struct xfrm_tunnel **pprev; 28 struct xfrm_tunnel __rcu **pprev;
29 struct xfrm_tunnel *t;
30
29 int ret = -EEXIST; 31 int ret = -EEXIST;
30 int priority = handler->priority; 32 int priority = handler->priority;
31 33
32 mutex_lock(&tunnel4_mutex); 34 mutex_lock(&tunnel4_mutex);
33 35
34 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 36 for (pprev = fam_handlers(family);
35 if ((*pprev)->priority > priority) 37 (t = rcu_dereference_protected(*pprev,
38 lockdep_is_held(&tunnel4_mutex))) != NULL;
39 pprev = &t->next) {
40 if (t->priority > priority)
36 break; 41 break;
37 if ((*pprev)->priority == priority) 42 if (t->priority == priority)
38 goto err; 43 goto err;
39 } 44 }
40 45
41 handler->next = *pprev; 46 handler->next = *pprev;
42 *pprev = handler; 47 rcu_assign_pointer(*pprev, handler);
43 48
44 ret = 0; 49 ret = 0;
45 50
@@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register);
52 57
53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 58int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
54{ 59{
55 struct xfrm_tunnel **pprev; 60 struct xfrm_tunnel __rcu **pprev;
61 struct xfrm_tunnel *t;
56 int ret = -ENOENT; 62 int ret = -ENOENT;
57 63
58 mutex_lock(&tunnel4_mutex); 64 mutex_lock(&tunnel4_mutex);
59 65
60 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 66 for (pprev = fam_handlers(family);
61 if (*pprev == handler) { 67 (t = rcu_dereference_protected(*pprev,
68 lockdep_is_held(&tunnel4_mutex))) != NULL;
69 pprev = &t->next) {
70 if (t == handler) {
62 *pprev = handler->next; 71 *pprev = handler->next;
63 ret = 0; 72 ret = 0;
64 break; 73 break;
@@ -73,6 +82,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
73} 82}
74EXPORT_SYMBOL(xfrm4_tunnel_deregister); 83EXPORT_SYMBOL(xfrm4_tunnel_deregister);
75 84
85#define for_each_tunnel_rcu(head, handler) \
86 for (handler = rcu_dereference(head); \
87 handler != NULL; \
88 handler = rcu_dereference(handler->next)) \
89
76static int tunnel4_rcv(struct sk_buff *skb) 90static int tunnel4_rcv(struct sk_buff *skb)
77{ 91{
78 struct xfrm_tunnel *handler; 92 struct xfrm_tunnel *handler;
@@ -80,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
80 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 94 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
81 goto drop; 95 goto drop;
82 96
83 for (handler = tunnel4_handlers; handler; handler = handler->next) 97 for_each_tunnel_rcu(tunnel4_handlers, handler)
84 if (!handler->handler(skb)) 98 if (!handler->handler(skb))
85 return 0; 99 return 0;
86 100
@@ -99,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
99 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 113 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
100 goto drop; 114 goto drop;
101 115
102 for (handler = tunnel64_handlers; handler; handler = handler->next) 116 for_each_tunnel_rcu(tunnel64_handlers, handler)
103 if (!handler->handler(skb)) 117 if (!handler->handler(skb))
104 return 0; 118 return 0;
105 119
@@ -115,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
115{ 129{
116 struct xfrm_tunnel *handler; 130 struct xfrm_tunnel *handler;
117 131
118 for (handler = tunnel4_handlers; handler; handler = handler->next) 132 for_each_tunnel_rcu(tunnel4_handlers, handler)
119 if (!handler->err_handler(skb, info)) 133 if (!handler->err_handler(skb, info))
120 break; 134 break;
121} 135}
@@ -125,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
125{ 139{
126 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
127 141
128 for (handler = tunnel64_handlers; handler; handler = handler->next) 142 for_each_tunnel_rcu(tunnel64_handlers, handler)
129 if (!handler->err_handler(skb, info)) 143 if (!handler->err_handler(skb, info))
130 break; 144 break;
131} 145}
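
The tunnel4.c conversion makes the handler lists RCU-safe: writers still serialise on tunnel4_mutex and walk the list with rcu_dereference_protected(), publishing new entries with rcu_assign_pointer(), while the receive and error paths iterate locklessly through for_each_tunnel_rcu(). A userspace sketch of the same pattern; the rcu_* macros here are simplified stand-ins with no memory barriers, only to show the shape of the registration and traversal code:

/* Sketch only: priority-ordered handler list, mutex for writers, lockless
 * readers.  The rcu_* macros are placeholders for the kernel primitives.
 */
#include <stdio.h>
#include <pthread.h>

#define rcu_assign_pointer(p, v)	((p) = (v))
#define rcu_dereference(p)		(p)
#define rcu_dereference_protected(p)	(p)

struct tunnel {
	struct tunnel *next;
	int priority;
	const char *name;
};

static struct tunnel *tunnel4_handlers;
static pthread_mutex_t tunnel4_mutex = PTHREAD_MUTEX_INITIALIZER;

#define for_each_tunnel_rcu(head, h) \
	for (h = rcu_dereference(head); h != NULL; h = rcu_dereference(h->next))

static int tunnel4_register(struct tunnel *handler)
{
	struct tunnel **pprev, *t;
	int ret = -1;			/* -EEXIST in the kernel */

	pthread_mutex_lock(&tunnel4_mutex);
	for (pprev = &tunnel4_handlers;
	     (t = rcu_dereference_protected(*pprev)) != NULL;
	     pprev = &t->next) {
		if (t->priority > handler->priority)
			break;
		if (t->priority == handler->priority)
			goto err;
	}
	handler->next = *pprev;
	rcu_assign_pointer(*pprev, handler);
	ret = 0;
err:
	pthread_mutex_unlock(&tunnel4_mutex);
	return ret;
}

int main(void)
{
	struct tunnel a = { .priority = 2, .name = "xfrm" };
	struct tunnel b = { .priority = 1, .name = "ipip" };
	struct tunnel *h;

	tunnel4_register(&a);
	tunnel4_register(&b);
	for_each_tunnel_rcu(tunnel4_handlers, h)
		printf("%s (priority %d)\n", h->name, h->priority);
	return 0;
}
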
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fb23c2e63b52..198f75b7bdd3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -110,7 +110,7 @@
110struct udp_table udp_table __read_mostly; 110struct udp_table udp_table __read_mostly;
111EXPORT_SYMBOL(udp_table); 111EXPORT_SYMBOL(udp_table);
112 112
113int sysctl_udp_mem[3] __read_mostly; 113long sysctl_udp_mem[3] __read_mostly;
114EXPORT_SYMBOL(sysctl_udp_mem); 114EXPORT_SYMBOL(sysctl_udp_mem);
115 115
116int sysctl_udp_rmem_min __read_mostly; 116int sysctl_udp_rmem_min __read_mostly;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
119int sysctl_udp_wmem_min __read_mostly; 119int sysctl_udp_wmem_min __read_mostly;
120EXPORT_SYMBOL(sysctl_udp_wmem_min); 120EXPORT_SYMBOL(sysctl_udp_wmem_min);
121 121
122atomic_t udp_memory_allocated; 122atomic_long_t udp_memory_allocated;
123EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
124 124
125#define MAX_UDP_PORTS 65536 125#define MAX_UDP_PORTS 65536
@@ -189,7 +189,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
189 * @sk: socket struct in question 189 * @sk: socket struct in question
190 * @snum: port number to look up 190 * @snum: port number to look up
191 * @saddr_comp: AF-dependent comparison of bound local IP addresses 191 * @saddr_comp: AF-dependent comparison of bound local IP addresses
192 * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, 192 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
193 * with NULL address 193 * with NULL address
194 */ 194 */
195int udp_lib_get_port(struct sock *sk, unsigned short snum, 195int udp_lib_get_port(struct sock *sk, unsigned short snum,
@@ -430,7 +430,7 @@ begin:
430 430
431 if (result) { 431 if (result) {
432exact_match: 432exact_match:
433 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 433 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
434 result = NULL; 434 result = NULL;
435 else if (unlikely(compute_score2(result, net, saddr, sport, 435 else if (unlikely(compute_score2(result, net, saddr, sport,
436 daddr, hnum, dif) < badness)) { 436 daddr, hnum, dif) < badness)) {
@@ -500,7 +500,7 @@ begin:
500 goto begin; 500 goto begin;
501 501
502 if (result) { 502 if (result) {
503 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 503 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
504 result = NULL; 504 result = NULL;
505 else if (unlikely(compute_score(result, net, saddr, hnum, sport, 505 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
506 daddr, dport, dif) < badness)) { 506 daddr, dport, dif) < badness)) {
@@ -578,7 +578,7 @@ found:
578void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) 578void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
579{ 579{
580 struct inet_sock *inet; 580 struct inet_sock *inet;
581 struct iphdr *iph = (struct iphdr *)skb->data; 581 const struct iphdr *iph = (const struct iphdr *)skb->data;
582 struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); 582 struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
583 const int type = icmp_hdr(skb)->type; 583 const int type = icmp_hdr(skb)->type;
584 const int code = icmp_hdr(skb)->code; 584 const int code = icmp_hdr(skb)->code;
@@ -663,75 +663,71 @@ void udp_flush_pending_frames(struct sock *sk)
663EXPORT_SYMBOL(udp_flush_pending_frames); 663EXPORT_SYMBOL(udp_flush_pending_frames);
664 664
665/** 665/**
666 * udp4_hwcsum_outgoing - handle outgoing HW checksumming 666 * udp4_hwcsum - handle outgoing HW checksumming
667 * @sk: socket we are sending on
668 * @skb: sk_buff containing the filled-in UDP header 667 * @skb: sk_buff containing the filled-in UDP header
669 * (checksum field must be zeroed out) 668 * (checksum field must be zeroed out)
669 * @src: source IP address
670 * @dst: destination IP address
670 */ 671 */
671static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, 672static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
672 __be32 src, __be32 dst, int len)
673{ 673{
674 unsigned int offset;
675 struct udphdr *uh = udp_hdr(skb); 674 struct udphdr *uh = udp_hdr(skb);
675 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
676 int offset = skb_transport_offset(skb);
677 int len = skb->len - offset;
678 int hlen = len;
676 __wsum csum = 0; 679 __wsum csum = 0;
677 680
678 if (skb_queue_len(&sk->sk_write_queue) == 1) { 681 if (!frags) {
679 /* 682 /*
680 * Only one fragment on the socket. 683 * Only one fragment on the socket.
681 */ 684 */
682 skb->csum_start = skb_transport_header(skb) - skb->head; 685 skb->csum_start = skb_transport_header(skb) - skb->head;
683 skb->csum_offset = offsetof(struct udphdr, check); 686 skb->csum_offset = offsetof(struct udphdr, check);
684 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); 687 uh->check = ~csum_tcpudp_magic(src, dst, len,
688 IPPROTO_UDP, 0);
685 } else { 689 } else {
686 /* 690 /*
687 * HW-checksum won't work as there are two or more 691 * HW-checksum won't work as there are two or more
688 * fragments on the socket so that all csums of sk_buffs 692 * fragments on the socket so that all csums of sk_buffs
689 * should be together 693 * should be together
690 */ 694 */
691 offset = skb_transport_offset(skb); 695 do {
692 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); 696 csum = csum_add(csum, frags->csum);
697 hlen -= frags->len;
698 } while ((frags = frags->next));
693 699
700 csum = skb_checksum(skb, offset, hlen, csum);
694 skb->ip_summed = CHECKSUM_NONE; 701 skb->ip_summed = CHECKSUM_NONE;
695 702
696 skb_queue_walk(&sk->sk_write_queue, skb) {
697 csum = csum_add(csum, skb->csum);
698 }
699
700 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); 703 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
701 if (uh->check == 0) 704 if (uh->check == 0)
702 uh->check = CSUM_MANGLED_0; 705 uh->check = CSUM_MANGLED_0;
703 } 706 }
704} 707}
705 708
706/* 709static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
707 * Push out all pending data as one UDP datagram. Socket is locked.
708 */
709static int udp_push_pending_frames(struct sock *sk)
710{ 710{
711 struct udp_sock *up = udp_sk(sk); 711 struct sock *sk = skb->sk;
712 struct inet_sock *inet = inet_sk(sk); 712 struct inet_sock *inet = inet_sk(sk);
713 struct flowi *fl = &inet->cork.fl;
714 struct sk_buff *skb;
715 struct udphdr *uh; 713 struct udphdr *uh;
716 int err = 0; 714 int err = 0;
717 int is_udplite = IS_UDPLITE(sk); 715 int is_udplite = IS_UDPLITE(sk);
716 int offset = skb_transport_offset(skb);
717 int len = skb->len - offset;
718 __wsum csum = 0; 718 __wsum csum = 0;
719 719
720 /* Grab the skbuff where UDP header space exists. */
721 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
722 goto out;
723
724 /* 720 /*
725 * Create a UDP header 721 * Create a UDP header
726 */ 722 */
727 uh = udp_hdr(skb); 723 uh = udp_hdr(skb);
728 uh->source = fl->fl_ip_sport; 724 uh->source = inet->inet_sport;
729 uh->dest = fl->fl_ip_dport; 725 uh->dest = fl4->fl4_dport;
730 uh->len = htons(up->len); 726 uh->len = htons(len);
731 uh->check = 0; 727 uh->check = 0;
732 728
733 if (is_udplite) /* UDP-Lite */ 729 if (is_udplite) /* UDP-Lite */
734 csum = udplite_csum_outgoing(sk, skb); 730 csum = udplite_csum(skb);
735 731
736 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 732 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
737 733
@@ -740,20 +736,20 @@ static int udp_push_pending_frames(struct sock *sk)
740 736
741 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 737 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
742 738
743 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); 739 udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
744 goto send; 740 goto send;
745 741
746 } else /* `normal' UDP */ 742 } else
747 csum = udp_csum_outgoing(sk, skb); 743 csum = udp_csum(skb);
748 744
749 /* add protocol-dependent pseudo-header */ 745 /* add protocol-dependent pseudo-header */
750 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, 746 uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
751 sk->sk_protocol, csum); 747 sk->sk_protocol, csum);
752 if (uh->check == 0) 748 if (uh->check == 0)
753 uh->check = CSUM_MANGLED_0; 749 uh->check = CSUM_MANGLED_0;
754 750
755send: 751send:
756 err = ip_push_pending_frames(sk); 752 err = ip_send_skb(skb);
757 if (err) { 753 if (err) {
758 if (err == -ENOBUFS && !inet->recverr) { 754 if (err == -ENOBUFS && !inet->recverr) {
759 UDP_INC_STATS_USER(sock_net(sk), 755 UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +759,26 @@ send:
763 } else 759 } else
764 UDP_INC_STATS_USER(sock_net(sk), 760 UDP_INC_STATS_USER(sock_net(sk),
765 UDP_MIB_OUTDATAGRAMS, is_udplite); 761 UDP_MIB_OUTDATAGRAMS, is_udplite);
762 return err;
763}
764
765/*
766 * Push out all pending data as one UDP datagram. Socket is locked.
767 */
768static int udp_push_pending_frames(struct sock *sk)
769{
770 struct udp_sock *up = udp_sk(sk);
771 struct inet_sock *inet = inet_sk(sk);
772 struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
773 struct sk_buff *skb;
774 int err = 0;
775
776 skb = ip_finish_skb(sk, fl4);
777 if (!skb)
778 goto out;
779
780 err = udp_send_skb(skb, fl4);
781
766out: 782out:
767 up->len = 0; 783 up->len = 0;
768 up->pending = 0; 784 up->pending = 0;
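
udp4_hwcsum() above handles datagrams built as a frag_list: the fragments' partial checksums are combined with csum_add(), the remaining linear part is covered by skb_checksum(), and the total is folded into the pseudo-header checksum. The property this relies on, that summing per fragment and summing in one pass give the same one's-complement result, is shown by the sketch below (fragments are kept even-length for simplicity; the kernel also copes with odd offsets):

/* Sketch only: per-fragment vs one-pass Internet checksum over the same data. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t csum_partial(const uint8_t *data, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const char *frag1 = "01234567";	/* even-length fragments, illustrative */
	const char *frag2 = "abcdefgh";
	char whole[32];

	snprintf(whole, sizeof(whole), "%s%s", frag1, frag2);

	uint32_t per_frag = csum_partial((const uint8_t *)frag2, strlen(frag2),
			    csum_partial((const uint8_t *)frag1, strlen(frag1), 0));
	uint32_t one_pass = csum_partial((const uint8_t *)whole, strlen(whole), 0);

	printf("per-fragment fold 0x%04x, one-pass fold 0x%04x\n",
	       csum_fold(per_frag), csum_fold(one_pass));
	return 0;
}
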
@@ -774,6 +790,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
774{ 790{
775 struct inet_sock *inet = inet_sk(sk); 791 struct inet_sock *inet = inet_sk(sk);
776 struct udp_sock *up = udp_sk(sk); 792 struct udp_sock *up = udp_sk(sk);
793 struct flowi4 fl4_stack;
794 struct flowi4 *fl4;
777 int ulen = len; 795 int ulen = len;
778 struct ipcm_cookie ipc; 796 struct ipcm_cookie ipc;
779 struct rtable *rt = NULL; 797 struct rtable *rt = NULL;
@@ -785,6 +803,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
785 int err, is_udplite = IS_UDPLITE(sk); 803 int err, is_udplite = IS_UDPLITE(sk);
786 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; 804 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
787 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); 805 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
806 struct sk_buff *skb;
807 struct ip_options_data opt_copy;
788 808
789 if (len > 0xFFFF) 809 if (len > 0xFFFF)
790 return -EMSGSIZE; 810 return -EMSGSIZE;
@@ -797,8 +817,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 return -EOPNOTSUPP; 817 return -EOPNOTSUPP;
798 818
799 ipc.opt = NULL; 819 ipc.opt = NULL;
800 ipc.shtx.flags = 0; 820 ipc.tx_flags = 0;
801 821
822 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
823
824 fl4 = &inet->cork.fl.u.ip4;
802 if (up->pending) { 825 if (up->pending) {
803 /* 826 /*
804 * There are pending frames. 827 * There are pending frames.
@@ -845,7 +868,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
845 ipc.addr = inet->inet_saddr; 868 ipc.addr = inet->inet_saddr;
846 869
847 ipc.oif = sk->sk_bound_dev_if; 870 ipc.oif = sk->sk_bound_dev_if;
848 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 871 err = sock_tx_timestamp(sk, &ipc.tx_flags);
849 if (err) 872 if (err)
850 return err; 873 return err;
851 if (msg->msg_controllen) { 874 if (msg->msg_controllen) {
@@ -856,22 +879,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
856 free = 1; 879 free = 1;
857 connected = 0; 880 connected = 0;
858 } 881 }
859 if (!ipc.opt) 882 if (!ipc.opt) {
860 ipc.opt = inet->opt; 883 struct ip_options_rcu *inet_opt;
884
885 rcu_read_lock();
886 inet_opt = rcu_dereference(inet->inet_opt);
887 if (inet_opt) {
888 memcpy(&opt_copy, inet_opt,
889 sizeof(*inet_opt) + inet_opt->opt.optlen);
890 ipc.opt = &opt_copy.opt;
891 }
892 rcu_read_unlock();
893 }
861 894
862 saddr = ipc.addr; 895 saddr = ipc.addr;
863 ipc.addr = faddr = daddr; 896 ipc.addr = faddr = daddr;
864 897
865 if (ipc.opt && ipc.opt->srr) { 898 if (ipc.opt && ipc.opt->opt.srr) {
866 if (!daddr) 899 if (!daddr)
867 return -EINVAL; 900 return -EINVAL;
868 faddr = ipc.opt->faddr; 901 faddr = ipc.opt->opt.faddr;
869 connected = 0; 902 connected = 0;
870 } 903 }
871 tos = RT_TOS(inet->tos); 904 tos = RT_TOS(inet->tos);
872 if (sock_flag(sk, SOCK_LOCALROUTE) || 905 if (sock_flag(sk, SOCK_LOCALROUTE) ||
873 (msg->msg_flags & MSG_DONTROUTE) || 906 (msg->msg_flags & MSG_DONTROUTE) ||
874 (ipc.opt && ipc.opt->is_strictroute)) { 907 (ipc.opt && ipc.opt->opt.is_strictroute)) {
875 tos |= RTO_ONLINK; 908 tos |= RTO_ONLINK;
876 connected = 0; 909 connected = 0;
877 } 910 }
@@ -888,22 +921,19 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
888 rt = (struct rtable *)sk_dst_check(sk, 0); 921 rt = (struct rtable *)sk_dst_check(sk, 0);
889 922
890 if (rt == NULL) { 923 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif,
892 .mark = sk->sk_mark,
893 .nl_u = { .ip4_u =
894 { .daddr = faddr,
895 .saddr = saddr,
896 .tos = tos } },
897 .proto = sk->sk_protocol,
898 .flags = inet_sk_flowi_flags(sk),
899 .uli_u = { .ports =
900 { .sport = inet->inet_sport,
901 .dport = dport } } };
902 struct net *net = sock_net(sk); 924 struct net *net = sock_net(sk);
903 925
904 security_sk_classify_flow(sk, &fl); 926 fl4 = &fl4_stack;
905 err = ip_route_output_flow(net, &rt, &fl, sk, 1); 927 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
906 if (err) { 928 RT_SCOPE_UNIVERSE, sk->sk_protocol,
929 inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
930 faddr, saddr, dport, inet->inet_sport);
931
932 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
933 rt = ip_route_output_flow(net, fl4, sk);
934 if (IS_ERR(rt)) {
935 err = PTR_ERR(rt);
936 rt = NULL;
907 if (err == -ENETUNREACH) 937 if (err == -ENETUNREACH)
908 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 938 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
909 goto out; 939 goto out;
@@ -921,9 +951,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
921 goto do_confirm; 951 goto do_confirm;
922back_from_confirm: 952back_from_confirm:
923 953
924 saddr = rt->rt_src; 954 saddr = fl4->saddr;
925 if (!ipc.addr) 955 if (!ipc.addr)
926 daddr = ipc.addr = rt->rt_dst; 956 daddr = ipc.addr = fl4->daddr;
957
958 /* Lockless fast path for the non-corking case. */
959 if (!corkreq) {
960 skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
961 sizeof(struct udphdr), &ipc, &rt,
962 msg->msg_flags);
963 err = PTR_ERR(skb);
964 if (skb && !IS_ERR(skb))
965 err = udp_send_skb(skb, fl4);
966 goto out;
967 }
927 968
928 lock_sock(sk); 969 lock_sock(sk);
929 if (unlikely(up->pending)) { 970 if (unlikely(up->pending)) {
@@ -938,18 +979,18 @@ back_from_confirm:
938 /* 979 /*
939 * Now cork the socket to pend data. 980 * Now cork the socket to pend data.
940 */ 981 */
941 inet->cork.fl.fl4_dst = daddr; 982 fl4 = &inet->cork.fl.u.ip4;
942 inet->cork.fl.fl_ip_dport = dport; 983 fl4->daddr = daddr;
943 inet->cork.fl.fl4_src = saddr; 984 fl4->saddr = saddr;
944 inet->cork.fl.fl_ip_sport = inet->inet_sport; 985 fl4->fl4_dport = dport;
986 fl4->fl4_sport = inet->inet_sport;
945 up->pending = AF_INET; 987 up->pending = AF_INET;
946 988
947do_append_data: 989do_append_data:
948 up->len += ulen; 990 up->len += ulen;
949 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; 991 err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
950 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 992 sizeof(struct udphdr), &ipc, &rt,
951 sizeof(struct udphdr), &ipc, &rt, 993 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
952 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
953 if (err) 994 if (err)
954 udp_flush_pending_frames(sk); 995 udp_flush_pending_frames(sk);
955 else if (!corkreq) 996 else if (!corkreq)
@@ -989,6 +1030,7 @@ EXPORT_SYMBOL(udp_sendmsg);
989int udp_sendpage(struct sock *sk, struct page *page, int offset, 1030int udp_sendpage(struct sock *sk, struct page *page, int offset,
990 size_t size, int flags) 1031 size_t size, int flags)
991{ 1032{
1033 struct inet_sock *inet = inet_sk(sk);
992 struct udp_sock *up = udp_sk(sk); 1034 struct udp_sock *up = udp_sk(sk);
993 int ret; 1035 int ret;
994 1036
@@ -1013,7 +1055,8 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
1013 return -EINVAL; 1055 return -EINVAL;
1014 } 1056 }
1015 1057
1016 ret = ip_append_page(sk, page, offset, size, flags); 1058 ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
1059 page, offset, size, flags);
1017 if (ret == -EOPNOTSUPP) { 1060 if (ret == -EOPNOTSUPP) {
1018 release_sock(sk); 1061 release_sock(sk);
1019 return sock_no_sendpage(sk->sk_socket, page, offset, 1062 return sock_no_sendpage(sk->sk_socket, page, offset,
@@ -1206,6 +1249,9 @@ csum_copy_err:
1206 1249
1207 if (noblock) 1250 if (noblock)
1208 return -EAGAIN; 1251 return -EAGAIN;
1252
1253 /* starting over for a new packet */
1254 msg->msg_flags &= ~MSG_TRUNC;
1209 goto try_again; 1255 goto try_again;
1210} 1256}
1211 1257
@@ -1413,7 +1459,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1413 } 1459 }
1414 } 1460 }
1415 1461
1416 if (sk->sk_filter) { 1462 if (rcu_dereference_raw(sk->sk_filter)) {
1417 if (udp_lib_checksum_complete(skb)) 1463 if (udp_lib_checksum_complete(skb))
1418 goto drop; 1464 goto drop;
1419 } 1465 }
@@ -1899,6 +1945,7 @@ struct proto udp_prot = {
1899 .compat_setsockopt = compat_udp_setsockopt, 1945 .compat_setsockopt = compat_udp_setsockopt,
1900 .compat_getsockopt = compat_udp_getsockopt, 1946 .compat_getsockopt = compat_udp_getsockopt,
1901#endif 1947#endif
1948 .clear_sk = sk_prot_clear_portaddr_nulls,
1902}; 1949};
1903EXPORT_SYMBOL(udp_prot); 1950EXPORT_SYMBOL(udp_prot);
1904 1951
@@ -2046,7 +2093,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2046 __u16 srcp = ntohs(inet->inet_sport); 2093 __u16 srcp = ntohs(inet->inet_sport);
2047 2094
2048 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 2095 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
2049 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", 2096 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
2050 bucket, src, srcp, dest, destp, sp->sk_state, 2097 bucket, src, srcp, dest, destp, sp->sk_state,
2051 sk_wmem_alloc_get(sp), 2098 sk_wmem_alloc_get(sp),
2052 sk_rmem_alloc_get(sp), 2099 sk_rmem_alloc_get(sp),
@@ -2162,16 +2209,10 @@ void __init udp_table_init(struct udp_table *table, const char *name)
2162 2209
2163void __init udp_init(void) 2210void __init udp_init(void)
2164{ 2211{
2165 unsigned long nr_pages, limit; 2212 unsigned long limit;
2166 2213
2167 udp_table_init(&udp_table, "UDP"); 2214 udp_table_init(&udp_table, "UDP");
2168 /* Set the pressure threshold up by the same strategy of TCP. It is a 2215 limit = nr_free_buffer_pages() / 8;
2169 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
2170 * toward zero with the amount of memory, with a floor of 128 pages.
2171 */
2172 nr_pages = totalram_pages - totalhigh_pages;
2173 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2174 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2175 limit = max(limit, 128UL); 2216 limit = max(limit, 128UL);
2176 sysctl_udp_mem[0] = limit / 4 * 3; 2217 sysctl_udp_mem[0] = limit / 4 * 3;
2177 sysctl_udp_mem[1] = limit; 2218 sysctl_udp_mem[1] = limit;
@@ -2200,7 +2241,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2200 return 0; 2241 return 0;
2201} 2242}
2202 2243
2203struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2244struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2204{ 2245{
2205 struct sk_buff *segs = ERR_PTR(-EINVAL); 2246 struct sk_buff *segs = ERR_PTR(-EINVAL);
2206 unsigned int mss; 2247 unsigned int mss;
@@ -2228,7 +2269,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
2228 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 2269 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2229 * do checksum of UDP packets sent as multiple IP fragments. 2270 * do checksum of UDP packets sent as multiple IP fragments.
2230 */ 2271 */
2231 offset = skb->csum_start - skb_headroom(skb); 2272 offset = skb_checksum_start_offset(skb);
2232 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2273 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2233 offset += skb->csum_offset; 2274 offset += skb->csum_offset;
2234 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2275 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
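
The udp_sendmsg() rework above splits transmission into udp_send_skb() and adds a lockless fast path: when the socket is not corked and MSG_MORE is not set, the datagram is built in one go with ip_make_skb() and sent without taking the socket lock, while corked sockets keep the ip_append_data()/udp_push_pending_frames() path. From userspace the two paths look roughly like this; the destination is illustrative and deliberately unreachable:

/* Sketch only: an uncorked sendto() takes the new fast path, UDP_CORK keeps
 * the corked path where frames are appended under the socket lock.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef UDP_CORK
#define UDP_CORK 1			/* from linux/udp.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	int on = 1, off = 0;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);	/* discard */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* uncorked datagram: one skb, lockless fast path */
	sendto(fd, "fast", 4, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* corked datagram: appended under the socket lock and pushed out as
	 * one packet when the cork is removed */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	sendto(fd, "slow ", 5, 0, (struct sockaddr *)&dst, sizeof(dst));
	sendto(fd, "path",  4, 0, (struct sockaddr *)&dst, sizeof(dst));
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));

	close(fd);
	return 0;
}
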
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa928fa9..aee9963f7f5a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto udplite_prot = {
57 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60 .clear_sk = sk_prot_clear_portaddr_nulls,
60}; 61};
61EXPORT_SYMBOL(udplite_prot); 62EXPORT_SYMBOL(udplite_prot);
62 63
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 6f368413eb0e..534972e114ac 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -56,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
57 ip_select_ident(top_iph, dst->child, NULL); 57 ip_select_ident(top_iph, dst->child, NULL);
58 58
59 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); 59 top_iph->ttl = ip4_dst_hoplimit(dst->child);
60 60
61 top_iph->saddr = x->props.saddr.a4; 61 top_iph->saddr = x->props.saddr.a4;
62 top_iph->daddr = x->id.daddr.a4; 62 top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 571aa96a175c..327a617d594c 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
32 dst = skb_dst(skb); 32 dst = skb_dst(skb);
33 mtu = dst_mtu(dst); 33 mtu = dst_mtu(dst);
34 if (skb->len > mtu) { 34 if (skb->len > mtu) {
35 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 35 if (skb->sk)
36 ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr,
37 inet_sk(skb->sk)->inet_dport, mtu);
38 else
39 icmp_send(skb, ICMP_DEST_UNREACH,
40 ICMP_FRAG_NEEDED, htonl(mtu));
36 ret = -EMSGSIZE; 41 ret = -EMSGSIZE;
37 } 42 }
38out: 43out:
@@ -69,7 +74,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
69} 74}
70EXPORT_SYMBOL(xfrm4_prepare_output); 75EXPORT_SYMBOL(xfrm4_prepare_output);
71 76
72static int xfrm4_output_finish(struct sk_buff *skb) 77int xfrm4_output_finish(struct sk_buff *skb)
73{ 78{
74#ifdef CONFIG_NETFILTER 79#ifdef CONFIG_NETFILTER
75 if (!skb_dst(skb)->xfrm) { 80 if (!skb_dst(skb)->xfrm) {
@@ -86,7 +91,11 @@ static int xfrm4_output_finish(struct sk_buff *skb)
86 91
87int xfrm4_output(struct sk_buff *skb) 92int xfrm4_output(struct sk_buff *skb)
88{ 93{
94 struct dst_entry *dst = skb_dst(skb);
95 struct xfrm_state *x = dst->xfrm;
96
89 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, 97 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
90 NULL, skb_dst(skb)->dev, xfrm4_output_finish, 98 NULL, dst->dev,
99 x->outer_mode->afinfo->output_finish,
91 !(IPCB(skb)->flags & IPSKB_REROUTED)); 100 !(IPCB(skb)->flags & IPSKB_REROUTED));
92} 101}
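
The xfrm4_output.c change reports an over-MTU, locally generated packet back to the sending socket with ip_local_error() (EMSGSIZE) instead of sending an ICMP fragmentation-needed message to ourselves; the report is delivered through the socket error queue when IP_RECVERR is enabled. The sketch below only shows the retrieval side of that mechanism and assumes an IPsec tunnel that actually shrinks the path MTU is already configured; on a plain socket the send may simply succeed or fragment:

/* Sketch only: drain the IP error queue where ip_local_error() queues the
 * EMSGSIZE report.  Destination and datagram size are illustrative.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/errqueue.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1;
	struct sockaddr_in dst;
	char payload[2000] = { 0 };
	char cbuf[256];
	struct msghdr msg;
	struct cmsghdr *cm;

	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	sendto(fd, payload, sizeof(payload), 0,
	       (struct sockaddr *)&dst, sizeof(dst));

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) >= 0) {
		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
			if (cm->cmsg_level == IPPROTO_IP &&
			    cm->cmsg_type == IP_RECVERR) {
				struct sock_extended_err *ee =
					(struct sock_extended_err *)CMSG_DATA(cm);
				printf("queued errno %d, mtu hint %u\n",
				       ee->ee_errno, ee->ee_info);
			}
		}
	}
	close(fd);
	return 0;
}
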
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a580349f0b8a..981e43eaf704 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -11,57 +11,60 @@
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/if_tunnel.h>
14#include <net/dst.h> 15#include <net/dst.h>
15#include <net/xfrm.h> 16#include <net/xfrm.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
19 20
20static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 21static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
21 xfrm_address_t *saddr, 22 int tos,
22 xfrm_address_t *daddr) 23 const xfrm_address_t *saddr,
24 const xfrm_address_t *daddr)
23{ 25{
24 struct flowi fl = {
25 .nl_u = {
26 .ip4_u = {
27 .tos = tos,
28 .daddr = daddr->a4,
29 },
30 },
31 };
32 struct dst_entry *dst;
33 struct rtable *rt; 26 struct rtable *rt;
34 int err;
35 27
28 memset(fl4, 0, sizeof(*fl4));
29 fl4->daddr = daddr->a4;
30 fl4->flowi4_tos = tos;
36 if (saddr) 31 if (saddr)
37 fl.fl4_src = saddr->a4; 32 fl4->saddr = saddr->a4;
33
34 rt = __ip_route_output_key(net, fl4);
35 if (!IS_ERR(rt))
36 return &rt->dst;
38 37
39 err = __ip_route_output_key(net, &rt, &fl); 38 return ERR_CAST(rt);
40 dst = &rt->dst; 39}
41 if (err) 40
42 dst = ERR_PTR(err); 41static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
43 return dst; 42 const xfrm_address_t *saddr,
43 const xfrm_address_t *daddr)
44{
45 struct flowi4 fl4;
46
47 return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
44} 48}
45 49
46static int xfrm4_get_saddr(struct net *net, 50static int xfrm4_get_saddr(struct net *net,
47 xfrm_address_t *saddr, xfrm_address_t *daddr) 51 xfrm_address_t *saddr, xfrm_address_t *daddr)
48{ 52{
49 struct dst_entry *dst; 53 struct dst_entry *dst;
50 struct rtable *rt; 54 struct flowi4 fl4;
51 55
52 dst = xfrm4_dst_lookup(net, 0, NULL, daddr); 56 dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
53 if (IS_ERR(dst)) 57 if (IS_ERR(dst))
54 return -EHOSTUNREACH; 58 return -EHOSTUNREACH;
55 59
56 rt = (struct rtable *)dst; 60 saddr->a4 = fl4.saddr;
57 saddr->a4 = rt->rt_src;
58 dst_release(dst); 61 dst_release(dst);
59 return 0; 62 return 0;
60} 63}
61 64
62static int xfrm4_get_tos(struct flowi *fl) 65static int xfrm4_get_tos(const struct flowi *fl)
63{ 66{
64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ 67 return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
65} 68}
66 69
67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 70static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -71,19 +74,22 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
71} 74}
72 75
73static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 76static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
74 struct flowi *fl) 77 const struct flowi *fl)
75{ 78{
76 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4;
77 81
78 xdst->u.rt.fl = *fl; 82 rt->rt_key_dst = fl4->daddr;
83 rt->rt_key_src = fl4->saddr;
84 rt->rt_key_tos = fl4->flowi4_tos;
85 rt->rt_route_iif = fl4->flowi4_iif;
86 rt->rt_iif = fl4->flowi4_iif;
87 rt->rt_oif = fl4->flowi4_oif;
88 rt->rt_mark = fl4->flowi4_mark;
79 89
80 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
81 dev_hold(dev); 91 dev_hold(dev);
82 92
83 xdst->u.rt.idev = in_dev_get(dev);
84 if (!xdst->u.rt.idev)
85 return -ENODEV;
86
87 xdst->u.rt.peer = rt->peer; 93 xdst->u.rt.peer = rt->peer;
88 if (rt->peer) 94 if (rt->peer)
89 atomic_inc(&rt->peer->refcnt); 95 atomic_inc(&rt->peer->refcnt);
@@ -104,11 +110,12 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
104static void 110static void
105_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) 111_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
106{ 112{
107 struct iphdr *iph = ip_hdr(skb); 113 const struct iphdr *iph = ip_hdr(skb);
108 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 114 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
115 struct flowi4 *fl4 = &fl->u.ip4;
109 116
110 memset(fl, 0, sizeof(struct flowi)); 117 memset(fl4, 0, sizeof(struct flowi4));
111 fl->mark = skb->mark; 118 fl4->flowi4_mark = skb->mark;
112 119
113 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 120 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
114 switch (iph->protocol) { 121 switch (iph->protocol) {
@@ -121,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
121 pskb_may_pull(skb, xprth + 4 - skb->data)) { 128 pskb_may_pull(skb, xprth + 4 - skb->data)) {
122 __be16 *ports = (__be16 *)xprth; 129 __be16 *ports = (__be16 *)xprth;
123 130
124 fl->fl_ip_sport = ports[!!reverse]; 131 fl4->fl4_sport = ports[!!reverse];
125 fl->fl_ip_dport = ports[!reverse]; 132 fl4->fl4_dport = ports[!reverse];
126 } 133 }
127 break; 134 break;
128 135
@@ -130,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
130 if (pskb_may_pull(skb, xprth + 2 - skb->data)) { 137 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
131 u8 *icmp = xprth; 138 u8 *icmp = xprth;
132 139
133 fl->fl_icmp_type = icmp[0]; 140 fl4->fl4_icmp_type = icmp[0];
134 fl->fl_icmp_code = icmp[1]; 141 fl4->fl4_icmp_code = icmp[1];
135 } 142 }
136 break; 143 break;
137 144
@@ -139,7 +146,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
139 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 146 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
140 __be32 *ehdr = (__be32 *)xprth; 147 __be32 *ehdr = (__be32 *)xprth;
141 148
142 fl->fl_ipsec_spi = ehdr[0]; 149 fl4->fl4_ipsec_spi = ehdr[0];
143 } 150 }
144 break; 151 break;
145 152
@@ -147,7 +154,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
147 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
148 __be32 *ah_hdr = (__be32*)xprth; 155 __be32 *ah_hdr = (__be32*)xprth;
149 156
150 fl->fl_ipsec_spi = ah_hdr[1]; 157 fl4->fl4_ipsec_spi = ah_hdr[1];
151 } 158 }
152 break; 159 break;
153 160
@@ -155,18 +162,32 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
155 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 162 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
156 __be16 *ipcomp_hdr = (__be16 *)xprth; 163 __be16 *ipcomp_hdr = (__be16 *)xprth;
157 164
158 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 165 fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
166 }
167 break;
168
169 case IPPROTO_GRE:
170 if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
171 __be16 *greflags = (__be16 *)xprth;
172 __be32 *gre_hdr = (__be32 *)xprth;
173
174 if (greflags[0] & GRE_KEY) {
175 if (greflags[0] & GRE_CSUM)
176 gre_hdr++;
177 fl4->fl4_gre_key = gre_hdr[1];
178 }
159 } 179 }
160 break; 180 break;
181
161 default: 182 default:
162 fl->fl_ipsec_spi = 0; 183 fl4->fl4_ipsec_spi = 0;
163 break; 184 break;
164 } 185 }
165 } 186 }
166 fl->proto = iph->protocol; 187 fl4->flowi4_proto = iph->protocol;
167 fl->fl4_dst = reverse ? iph->saddr : iph->daddr; 188 fl4->daddr = reverse ? iph->saddr : iph->daddr;
168 fl->fl4_src = reverse ? iph->daddr : iph->saddr; 189 fl4->saddr = reverse ? iph->daddr : iph->saddr;
169 fl->fl4_tos = iph->tos; 190 fl4->flowi4_tos = iph->tos;
170} 191}
171 192
172static inline int xfrm4_garbage_collect(struct dst_ops *ops) 193static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -174,7 +195,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 195 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
175 196
176 xfrm4_policy_afinfo.garbage_collect(net); 197 xfrm4_policy_afinfo.garbage_collect(net);
177 return (atomic_read(&ops->entries) > ops->gc_thresh * 2); 198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
178} 199}
179 200
180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -189,37 +210,20 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
189{ 210{
190 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 211 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
191 212
192 if (likely(xdst->u.rt.idev)) 213 dst_destroy_metrics_generic(dst);
193 in_dev_put(xdst->u.rt.idev); 214
194 if (likely(xdst->u.rt.peer)) 215 if (likely(xdst->u.rt.peer))
195 inet_putpeer(xdst->u.rt.peer); 216 inet_putpeer(xdst->u.rt.peer);
217
196 xfrm_dst_destroy(xdst); 218 xfrm_dst_destroy(xdst);
197} 219}
198 220
199static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 221static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
200 int unregister) 222 int unregister)
201{ 223{
202 struct xfrm_dst *xdst;
203
204 if (!unregister) 224 if (!unregister)
205 return; 225 return;
206 226
207 xdst = (struct xfrm_dst *)dst;
208 if (xdst->u.rt.idev->dev == dev) {
209 struct in_device *loopback_idev =
210 in_dev_get(dev_net(dev)->loopback_dev);
211 BUG_ON(!loopback_idev);
212
213 do {
214 in_dev_put(xdst->u.rt.idev);
215 xdst->u.rt.idev = loopback_idev;
216 in_dev_hold(loopback_idev);
217 xdst = (struct xfrm_dst *)xdst->u.dst.child;
218 } while (xdst->u.dst.xfrm);
219
220 __in_dev_put(loopback_idev);
221 }
222
223 xfrm_dst_ifdown(dst, dev); 227 xfrm_dst_ifdown(dst, dev);
224} 228}
225 229
@@ -228,11 +232,11 @@ static struct dst_ops xfrm4_dst_ops = {
228 .protocol = cpu_to_be16(ETH_P_IP), 232 .protocol = cpu_to_be16(ETH_P_IP),
229 .gc = xfrm4_garbage_collect, 233 .gc = xfrm4_garbage_collect,
230 .update_pmtu = xfrm4_update_pmtu, 234 .update_pmtu = xfrm4_update_pmtu,
235 .cow_metrics = dst_cow_metrics_generic,
231 .destroy = xfrm4_dst_destroy, 236 .destroy = xfrm4_dst_destroy,
232 .ifdown = xfrm4_dst_ifdown, 237 .ifdown = xfrm4_dst_ifdown,
233 .local_out = __ip_local_out, 238 .local_out = __ip_local_out,
234 .gc_thresh = 1024, 239 .gc_thresh = 1024,
235 .entries = ATOMIC_INIT(0),
236}; 240};
237 241
238static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 242static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -244,6 +248,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
244 .get_tos = xfrm4_get_tos, 248 .get_tos = xfrm4_get_tos,
245 .init_path = xfrm4_init_path, 249 .init_path = xfrm4_init_path,
246 .fill_dst = xfrm4_fill_dst, 250 .fill_dst = xfrm4_fill_dst,
251 .blackhole_route = ipv4_blackhole_route,
247}; 252};
248 253
249#ifdef CONFIG_SYSCTL 254#ifdef CONFIG_SYSCTL
@@ -288,6 +293,7 @@ void __init xfrm4_init(int rt_max_size)
288 * and start cleaning when were 1/2 full 293 * and start cleaning when were 1/2 full
289 */ 294 */
290 xfrm4_dst_ops.gc_thresh = rt_max_size/2; 295 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
296 dst_entries_init(&xfrm4_dst_ops);
291 297
292 xfrm4_state_init(); 298 xfrm4_state_init();
293 xfrm4_policy_init(); 299 xfrm4_policy_init();
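
Besides the flowi to flowi4 conversion, _decode_session4() learns to key IPsec policy lookups on the GRE key: when the GRE flags word has GRE_KEY set, the key (skipping the optional checksum word when GRE_CSUM is also set) is copied into fl4->fl4_gre_key. A standalone model of that parsing step; the flag values mirror linux/if_tunnel.h and the sample header is made up:

/* Sketch only: pull the key out of a GRE header the way the new
 * IPPROTO_GRE case does, keeping it in network byte order.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define GRE_CSUM	htons(0x8000)
#define GRE_KEY		htons(0x2000)

static int gre_extract_key(const uint8_t *xprth, uint32_t *key)
{
	uint16_t greflags;
	const uint8_t *p = xprth;

	memcpy(&greflags, xprth, sizeof(greflags));
	if (!(greflags & GRE_KEY))
		return 0;
	p += 4;				/* flags + protocol */
	if (greflags & GRE_CSUM)
		p += 4;			/* checksum + reserved */
	memcpy(key, p, sizeof(*key));
	return 1;
}

int main(void)
{
	/* 12 bytes, as much as the kernel pulls: key flag set, key 0xdeadbeef */
	uint8_t hdr[12] = { 0x20, 0x00, 0x08, 0x00,
			    0xde, 0xad, 0xbe, 0xef };
	uint32_t key;

	if (gre_extract_key(hdr, &key))
		printf("gre key 0x%08x\n", ntohl(key));
	return 0;
}
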
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624eccc..d9ac0a0058b5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) 24__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
25{ 25{
26 sel->daddr.a4 = fl->fl4_dst; 26 const struct flowi4 *fl4 = &fl->u.ip4;
27 sel->saddr.a4 = fl->fl4_src; 27
28 sel->dport = xfrm_flowi_dport(fl); 28 sel->daddr.a4 = fl4->daddr;
29 sel->saddr.a4 = fl4->saddr;
30 sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
29 sel->dport_mask = htons(0xffff); 31 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl); 32 sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
31 sel->sport_mask = htons(0xffff); 33 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET; 34 sel->family = AF_INET;
33 sel->prefixlen_d = 32; 35 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32; 36 sel->prefixlen_s = 32;
35 sel->proto = fl->proto; 37 sel->proto = fl4->flowi4_proto;
36 sel->ifindex = fl->oif; 38 sel->ifindex = fl4->flowi4_oif;
37} 39}
38 40
39static void 41static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, 42xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr) 43 const xfrm_address_t *daddr, const xfrm_address_t *saddr)
42{ 44{
43 x->id = tmpl->id; 45 x->id = tmpl->id;
44 if (x->id.daddr.a4 == 0) 46 if (x->id.daddr.a4 == 0)
@@ -53,7 +55,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
53 55
54int xfrm4_extract_header(struct sk_buff *skb) 56int xfrm4_extract_header(struct sk_buff *skb)
55{ 57{
56 struct iphdr *iph = ip_hdr(skb); 58 const struct iphdr *iph = ip_hdr(skb);
57 59
58 XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); 60 XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
59 XFRM_MODE_SKB_CB(skb)->id = iph->id; 61 XFRM_MODE_SKB_CB(skb)->id = iph->id;
@@ -76,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
76 .init_tempsel = __xfrm4_init_tempsel, 78 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop, 79 .init_temprop = xfrm4_init_temprop,
78 .output = xfrm4_output, 80 .output = xfrm4_output,
81 .output_finish = xfrm4_output_finish,
79 .extract_input = xfrm4_extract_input, 82 .extract_input = xfrm4_extract_input,
80 .extract_output = xfrm4_extract_output, 83 .extract_output = xfrm4_extract_output,
81 .transport_finish = xfrm4_transport_finish, 84 .transport_finish = xfrm4_transport_finish,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,