Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                 |   11
-rw-r--r--  net/ipv4/Makefile                                |    3
-rw-r--r--  net/ipv4/af_inet.c                               |   75
-rw-r--r--  net/ipv4/ah4.c                                   |   17
-rw-r--r--  net/ipv4/arp.c                                   |    8
-rw-r--r--  net/ipv4/devinet.c                               |    5
-rw-r--r--  net/ipv4/esp4.c                                  |   17
-rw-r--r--  net/ipv4/fib_frontend.c                          |  130
-rw-r--r--  net/ipv4/fib_rules.c                             |   39
-rw-r--r--  net/ipv4/fib_semantics.c                         |   46
-rw-r--r--  net/ipv4/fib_trie.c                              |   13
-rw-r--r--  net/ipv4/icmp.c                                  |  191
-rw-r--r--  net/ipv4/inet_connection_sock.c                  |   53
-rw-r--r--  net/ipv4/inet_diag.c                             |  146
-rw-r--r--  net/ipv4/inet_fragment.c                         |    2
-rw-r--r--  net/ipv4/inetpeer.c                              |   99
-rw-r--r--  net/ipv4/ip_fragment.c                           |   10
-rw-r--r--  net/ipv4/ip_gre.c                                |   25
-rw-r--r--  net/ipv4/ip_input.c                              |   32
-rw-r--r--  net/ipv4/ip_options.c                            |   29
-rw-r--r--  net/ipv4/ip_output.c                             |   93
-rw-r--r--  net/ipv4/ip_sockglue.c                           |   12
-rw-r--r--  net/ipv4/ip_vti.c                                |  956
-rw-r--r--  net/ipv4/ipcomp.c                                |   17
-rw-r--r--  net/ipv4/ipip.c                                  |   28
-rw-r--r--  net/ipv4/ipmr.c                                  |   41
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c              |    5
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c                    |   23
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c   |  172
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c     |   81
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c              |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c               |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                 |    6
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c                 |    8
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c               |   13
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c                 |    6
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c           |    4
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c                 |    4
-rw-r--r--  net/ipv4/ping.c                                  |    2
-rw-r--r--  net/ipv4/proc.c                                  |    7
-rw-r--r--  net/ipv4/protocol.c                              |    8
-rw-r--r--  net/ipv4/raw.c                                   |    5
-rw-r--r--  net/ipv4/route.c                                 | 2146
-rw-r--r--  net/ipv4/syncookies.c                            |    2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                       |   28
-rw-r--r--  net/ipv4/tcp.c                                   |   72
-rw-r--r--  net/ipv4/tcp_cong.c                              |    5
-rw-r--r--  net/ipv4/tcp_fastopen.c                          |   11
-rw-r--r--  net/ipv4/tcp_input.c                             |  375
-rw-r--r--  net/ipv4/tcp_ipv4.c                              |  186
-rw-r--r--  net/ipv4/tcp_metrics.c                           |  745
-rw-r--r--  net/ipv4/tcp_minisocks.c                         |   61
-rw-r--r--  net/ipv4/tcp_output.c                            |  343
-rw-r--r--  net/ipv4/tcp_timer.c                             |   70
-rw-r--r--  net/ipv4/udp.c                                   |    9
-rw-r--r--  net/ipv4/udp_diag.c                              |   10
-rw-r--r--  net/ipv4/xfrm4_input.c                           |    4
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                     |   68
-rw-r--r--  net/ipv4/xfrm4_policy.c                          |   33
59 files changed, 4048 insertions(+), 2570 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 20f1cb5c8aba..5a19aeb86094 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -310,6 +310,17 @@ config SYN_COOKIES
310 310
311 If unsure, say N. 311 If unsure, say N.
312 312
313config NET_IPVTI
314 tristate "Virtual (secure) IP: tunneling"
315 select INET_TUNNEL
316 depends on INET_XFRM_MODE_TUNNEL
317 ---help---
318 Tunneling means encapsulating data of one protocol type within
319 another protocol and sending it over a channel that understands the
320 encapsulating protocol. This can be used with xfrm mode tunnel to give
321 the notion of a secure tunnel for IPSEC and then use routing protocol
322 on top.
323
313config INET_AH 324config INET_AH
314 tristate "IP: AH transformation" 325 tristate "IP: AH transformation"
315 select XFRM_ALGO 326 select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..ae2ccf2890e4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o inet_hashtables.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \ 8 inet_timewait_sock.o inet_connection_sock.o \
9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
10 tcp_minisocks.o tcp_cong.o \ 10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
11 datagram.o raw.o udp.o udplite.o \ 11 datagram.o raw.o udp.o udplite.o \
12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 12 arp.o icmp.o devinet.o af_inet.o igmp.o \
13 fib_frontend.o fib_semantics.o fib_trie.o \ 13 fib_frontend.o fib_semantics.o fib_trie.o \
@@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
20obj-$(CONFIG_NET_IPIP) += ipip.o 20obj-$(CONFIG_NET_IPIP) += ipip.o
21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
22obj-$(CONFIG_NET_IPGRE) += ip_gre.o 22obj-$(CONFIG_NET_IPGRE) += ip_gre.o
23obj-$(CONFIG_NET_IPVTI) += ip_vti.o
23obj-$(CONFIG_SYN_COOKIES) += syncookies.o 24obj-$(CONFIG_SYN_COOKIES) += syncookies.o
24obj-$(CONFIG_INET_AH) += ah4.o 25obj-$(CONFIG_INET_AH) += ah4.o
25obj-$(CONFIG_INET_ESP) += esp4.o 26obj-$(CONFIG_INET_ESP) += esp4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c8f7aee587d1..fe4582ca969a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -157,6 +157,7 @@ void inet_sock_destruct(struct sock *sk)
157 157
158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 158 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160 dst_release(sk->sk_rx_dst);
160 sk_refcnt_debug_dec(sk); 161 sk_refcnt_debug_dec(sk);
161} 162}
162EXPORT_SYMBOL(inet_sock_destruct); 163EXPORT_SYMBOL(inet_sock_destruct);
@@ -242,20 +243,18 @@ void build_ehash_secret(void)
242} 243}
243EXPORT_SYMBOL(build_ehash_secret); 244EXPORT_SYMBOL(build_ehash_secret);
244 245
245static inline int inet_netns_ok(struct net *net, int protocol) 246static inline int inet_netns_ok(struct net *net, __u8 protocol)
246{ 247{
247 int hash;
248 const struct net_protocol *ipprot; 248 const struct net_protocol *ipprot;
249 249
250 if (net_eq(net, &init_net)) 250 if (net_eq(net, &init_net))
251 return 1; 251 return 1;
252 252
253 hash = protocol & (MAX_INET_PROTOS - 1); 253 ipprot = rcu_dereference(inet_protos[protocol]);
254 ipprot = rcu_dereference(inet_protos[hash]); 254 if (ipprot == NULL) {
255
256 if (ipprot == NULL)
257 /* raw IP is OK */ 255 /* raw IP is OK */
258 return 1; 256 return 1;
257 }
259 return ipprot->netns_ok; 258 return ipprot->netns_ok;
260} 259}
261 260
@@ -553,15 +552,16 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
553 552
554 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 553 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
555 return -EAGAIN; 554 return -EAGAIN;
556 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); 555 return sk->sk_prot->connect(sk, uaddr, addr_len);
557} 556}
558EXPORT_SYMBOL(inet_dgram_connect); 557EXPORT_SYMBOL(inet_dgram_connect);
559 558
560static long inet_wait_for_connect(struct sock *sk, long timeo) 559static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
561{ 560{
562 DEFINE_WAIT(wait); 561 DEFINE_WAIT(wait);
563 562
564 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 563 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
564 sk->sk_write_pending += writebias;
565 565
566 /* Basic assumption: if someone sets sk->sk_err, he _must_ 566 /* Basic assumption: if someone sets sk->sk_err, he _must_
567 * change state of the socket from TCP_SYN_*. 567 * change state of the socket from TCP_SYN_*.
@@ -577,6 +577,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
577 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 577 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
578 } 578 }
579 finish_wait(sk_sleep(sk), &wait); 579 finish_wait(sk_sleep(sk), &wait);
580 sk->sk_write_pending -= writebias;
580 return timeo; 581 return timeo;
581} 582}
582 583
@@ -584,8 +585,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
584 * Connect to a remote host. There is regrettably still a little 585 * Connect to a remote host. There is regrettably still a little
585 * TCP 'magic' in here. 586 * TCP 'magic' in here.
586 */ 587 */
587int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 588int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
588 int addr_len, int flags) 589 int addr_len, int flags)
589{ 590{
590 struct sock *sk = sock->sk; 591 struct sock *sk = sock->sk;
591 int err; 592 int err;
@@ -594,8 +595,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
594 if (addr_len < sizeof(uaddr->sa_family)) 595 if (addr_len < sizeof(uaddr->sa_family))
595 return -EINVAL; 596 return -EINVAL;
596 597
597 lock_sock(sk);
598
599 if (uaddr->sa_family == AF_UNSPEC) { 598 if (uaddr->sa_family == AF_UNSPEC) {
600 err = sk->sk_prot->disconnect(sk, flags); 599 err = sk->sk_prot->disconnect(sk, flags);
601 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 600 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -635,8 +634,12 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
635 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 634 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
636 635
637 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 636 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
637 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
638 tcp_sk(sk)->fastopen_req &&
639 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
640
638 /* Error code is set above */ 641 /* Error code is set above */
639 if (!timeo || !inet_wait_for_connect(sk, timeo)) 642 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
640 goto out; 643 goto out;
641 644
642 err = sock_intr_errno(timeo); 645 err = sock_intr_errno(timeo);
@@ -658,7 +661,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
658 sock->state = SS_CONNECTED; 661 sock->state = SS_CONNECTED;
659 err = 0; 662 err = 0;
660out: 663out:
661 release_sock(sk);
662 return err; 664 return err;
663 665
664sock_error: 666sock_error:
@@ -668,6 +670,18 @@ sock_error:
668 sock->state = SS_DISCONNECTING; 670 sock->state = SS_DISCONNECTING;
669 goto out; 671 goto out;
670} 672}
673EXPORT_SYMBOL(__inet_stream_connect);
674
675int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
676 int addr_len, int flags)
677{
678 int err;
679
680 lock_sock(sock->sk);
681 err = __inet_stream_connect(sock, uaddr, addr_len, flags);
682 release_sock(sock->sk);
683 return err;
684}
671EXPORT_SYMBOL(inet_stream_connect); 685EXPORT_SYMBOL(inet_stream_connect);
672 686
673/* 687/*
@@ -1216,8 +1230,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
1216 1230
1217static int inet_gso_send_check(struct sk_buff *skb) 1231static int inet_gso_send_check(struct sk_buff *skb)
1218{ 1232{
1219 const struct iphdr *iph;
1220 const struct net_protocol *ops; 1233 const struct net_protocol *ops;
1234 const struct iphdr *iph;
1221 int proto; 1235 int proto;
1222 int ihl; 1236 int ihl;
1223 int err = -EINVAL; 1237 int err = -EINVAL;
@@ -1236,7 +1250,7 @@ static int inet_gso_send_check(struct sk_buff *skb)
1236 __skb_pull(skb, ihl); 1250 __skb_pull(skb, ihl);
1237 skb_reset_transport_header(skb); 1251 skb_reset_transport_header(skb);
1238 iph = ip_hdr(skb); 1252 iph = ip_hdr(skb);
1239 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1253 proto = iph->protocol;
1240 err = -EPROTONOSUPPORT; 1254 err = -EPROTONOSUPPORT;
1241 1255
1242 rcu_read_lock(); 1256 rcu_read_lock();
@@ -1253,8 +1267,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1253 netdev_features_t features) 1267 netdev_features_t features)
1254{ 1268{
1255 struct sk_buff *segs = ERR_PTR(-EINVAL); 1269 struct sk_buff *segs = ERR_PTR(-EINVAL);
1256 struct iphdr *iph;
1257 const struct net_protocol *ops; 1270 const struct net_protocol *ops;
1271 struct iphdr *iph;
1258 int proto; 1272 int proto;
1259 int ihl; 1273 int ihl;
1260 int id; 1274 int id;
@@ -1286,7 +1300,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1286 skb_reset_transport_header(skb); 1300 skb_reset_transport_header(skb);
1287 iph = ip_hdr(skb); 1301 iph = ip_hdr(skb);
1288 id = ntohs(iph->id); 1302 id = ntohs(iph->id);
1289 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1303 proto = iph->protocol;
1290 segs = ERR_PTR(-EPROTONOSUPPORT); 1304 segs = ERR_PTR(-EPROTONOSUPPORT);
1291 1305
1292 rcu_read_lock(); 1306 rcu_read_lock();
@@ -1340,7 +1354,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1340 goto out; 1354 goto out;
1341 } 1355 }
1342 1356
1343 proto = iph->protocol & (MAX_INET_PROTOS - 1); 1357 proto = iph->protocol;
1344 1358
1345 rcu_read_lock(); 1359 rcu_read_lock();
1346 ops = rcu_dereference(inet_protos[proto]); 1360 ops = rcu_dereference(inet_protos[proto]);
@@ -1398,11 +1412,11 @@ out:
1398 1412
1399static int inet_gro_complete(struct sk_buff *skb) 1413static int inet_gro_complete(struct sk_buff *skb)
1400{ 1414{
1401 const struct net_protocol *ops; 1415 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1402 struct iphdr *iph = ip_hdr(skb); 1416 struct iphdr *iph = ip_hdr(skb);
1403 int proto = iph->protocol & (MAX_INET_PROTOS - 1); 1417 const struct net_protocol *ops;
1418 int proto = iph->protocol;
1404 int err = -ENOSYS; 1419 int err = -ENOSYS;
1405 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1406 1420
1407 csum_replace2(&iph->check, iph->tot_len, newlen); 1421 csum_replace2(&iph->check, iph->tot_len, newlen);
1408 iph->tot_len = newlen; 1422 iph->tot_len = newlen;
@@ -1520,14 +1534,15 @@ static const struct net_protocol igmp_protocol = {
1520#endif 1534#endif
1521 1535
1522static const struct net_protocol tcp_protocol = { 1536static const struct net_protocol tcp_protocol = {
1523 .handler = tcp_v4_rcv, 1537 .early_demux = tcp_v4_early_demux,
1524 .err_handler = tcp_v4_err, 1538 .handler = tcp_v4_rcv,
1525 .gso_send_check = tcp_v4_gso_send_check, 1539 .err_handler = tcp_v4_err,
1526 .gso_segment = tcp_tso_segment, 1540 .gso_send_check = tcp_v4_gso_send_check,
1527 .gro_receive = tcp4_gro_receive, 1541 .gso_segment = tcp_tso_segment,
1528 .gro_complete = tcp4_gro_complete, 1542 .gro_receive = tcp4_gro_receive,
1529 .no_policy = 1, 1543 .gro_complete = tcp4_gro_complete,
1530 .netns_ok = 1, 1544 .no_policy = 1,
1545 .netns_ok = 1,
1531}; 1546};
1532 1547
1533static const struct net_protocol udp_protocol = { 1548static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e8f2617ecd47..a0d8392491c3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -398,16 +398,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); 398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
399 struct xfrm_state *x; 399 struct xfrm_state *x;
400 400
401 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 401 switch (icmp_hdr(skb)->type) {
402 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 402 case ICMP_DEST_UNREACH:
403 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
404 return;
405 case ICMP_REDIRECT:
406 break;
407 default:
403 return; 408 return;
409 }
404 410
405 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 411 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
406 ah->spi, IPPROTO_AH, AF_INET); 412 ah->spi, IPPROTO_AH, AF_INET);
407 if (!x) 413 if (!x)
408 return; 414 return;
409 pr_debug("pmtu discovery on SA AH/%08x/%08x\n", 415
410 ntohl(ah->spi), ntohl(iph->daddr)); 416 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
417 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
418 else
419 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
411 xfrm_state_put(x); 420 xfrm_state_put(x);
412} 421}
413 422
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index cda37be02f8d..a0124eb7dbea 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
475 return 1; 475 return 1;
476 } 476 }
477 477
478 paddr = skb_rtable(skb)->rt_gateway; 478 paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
479
480 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, 479 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
481 paddr, dev)) 480 paddr, dev))
482 return 0; 481 return 0;
@@ -790,7 +789,8 @@ static int arp_process(struct sk_buff *skb)
790 * Check for bad requests for 127.x.x.x and requests for multicast 789 * Check for bad requests for 127.x.x.x and requests for multicast
791 * addresses. If this is one such, delete it. 790 * addresses. If this is one such, delete it.
792 */ 791 */
793 if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) 792 if (ipv4_is_multicast(tip) ||
793 (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
794 goto out; 794 goto out;
795 795
796/* 796/*
@@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
827 } 827 }
828 828
829 if (arp->ar_op == htons(ARPOP_REQUEST) && 829 if (arp->ar_op == htons(ARPOP_REQUEST) &&
830 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { 830 ip_route_input(skb, tip, sip, 0, dev) == 0) {
831 831
832 rt = skb_rtable(skb); 832 rt = skb_rtable(skb);
833 addr_type = rt->rt_type; 833 addr_type = rt->rt_type;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 10e15a144e95..44bf82e3aef7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1500,7 +1500,8 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1500 1500
1501 if (cnf == net->ipv4.devconf_dflt) 1501 if (cnf == net->ipv4.devconf_dflt)
1502 devinet_copy_dflt_conf(net, i); 1502 devinet_copy_dflt_conf(net, i);
1503 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1) 1503 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
1504 i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
1504 if ((new_value == 0) && (old_value != 0)) 1505 if ((new_value == 0) && (old_value != 0))
1505 rt_cache_flush(net, 0); 1506 rt_cache_flush(net, 0);
1506 } 1507 }
@@ -1617,6 +1618,8 @@ static struct devinet_sysctl_table {
1617 "force_igmp_version"), 1618 "force_igmp_version"),
1618 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, 1619 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
1619 "promote_secondaries"), 1620 "promote_secondaries"),
1621 DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
1622 "route_localnet"),
1620 }, 1623 },
1621}; 1624};
1622 1625
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb982a61536f..b61e9deb7c7e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -484,16 +484,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
485 struct xfrm_state *x; 485 struct xfrm_state *x;
486 486
487 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 487 switch (icmp_hdr(skb)->type) {
488 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 488 case ICMP_DEST_UNREACH:
489 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
490 return;
491 case ICMP_REDIRECT:
492 break;
493 default:
489 return; 494 return;
495 }
490 496
491 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 497 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
492 esph->spi, IPPROTO_ESP, AF_INET); 498 esph->spi, IPPROTO_ESP, AF_INET);
493 if (!x) 499 if (!x)
494 return; 500 return;
495 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 501
496 ntohl(esph->spi), ntohl(iph->daddr)); 502 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
503 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
504 else
505 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
497 xfrm_state_put(x); 506 xfrm_state_put(x);
498} 507}
499 508
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3854411fa37c..8732cc7920ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -31,6 +31,7 @@
31#include <linux/if_addr.h> 31#include <linux/if_addr.h>
32#include <linux/if_arp.h> 32#include <linux/if_arp.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/cache.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/list.h> 36#include <linux/list.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
@@ -85,6 +86,24 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
85 tb = fib_trie_table(id); 86 tb = fib_trie_table(id);
86 if (!tb) 87 if (!tb)
87 return NULL; 88 return NULL;
89
90 switch (id) {
91 case RT_TABLE_LOCAL:
92 net->ipv4.fib_local = tb;
93 break;
94
95 case RT_TABLE_MAIN:
96 net->ipv4.fib_main = tb;
97 break;
98
99 case RT_TABLE_DEFAULT:
100 net->ipv4.fib_default = tb;
101 break;
102
103 default:
104 break;
105 }
106
88 h = id & (FIB_TABLE_HASHSZ - 1); 107 h = id & (FIB_TABLE_HASHSZ - 1);
89 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); 108 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
90 return tb; 109 return tb;
@@ -150,10 +169,6 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
150 if (ipv4_is_multicast(addr)) 169 if (ipv4_is_multicast(addr))
151 return RTN_MULTICAST; 170 return RTN_MULTICAST;
152 171
153#ifdef CONFIG_IP_MULTIPLE_TABLES
154 res.r = NULL;
155#endif
156
157 local_table = fib_get_table(net, RT_TABLE_LOCAL); 172 local_table = fib_get_table(net, RT_TABLE_LOCAL);
158 if (local_table) { 173 if (local_table) {
159 ret = RTN_UNICAST; 174 ret = RTN_UNICAST;
@@ -180,6 +195,44 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
180} 195}
181EXPORT_SYMBOL(inet_dev_addr_type); 196EXPORT_SYMBOL(inet_dev_addr_type);
182 197
198__be32 fib_compute_spec_dst(struct sk_buff *skb)
199{
200 struct net_device *dev = skb->dev;
201 struct in_device *in_dev;
202 struct fib_result res;
203 struct rtable *rt;
204 struct flowi4 fl4;
205 struct net *net;
206 int scope;
207
208 rt = skb_rtable(skb);
209 if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
210 RTCF_LOCAL)
211 return ip_hdr(skb)->daddr;
212
213 in_dev = __in_dev_get_rcu(dev);
214 BUG_ON(!in_dev);
215
216 net = dev_net(dev);
217
218 scope = RT_SCOPE_UNIVERSE;
219 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
220 fl4.flowi4_oif = 0;
221 fl4.flowi4_iif = net->loopback_dev->ifindex;
222 fl4.daddr = ip_hdr(skb)->saddr;
223 fl4.saddr = 0;
224 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
225 fl4.flowi4_scope = scope;
226 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
227 if (!fib_lookup(net, &fl4, &res))
228 return FIB_RES_PREFSRC(net, res);
229 } else {
230 scope = RT_SCOPE_LINK;
231 }
232
233 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
234}
235
183/* Given (packet source, input interface) and optional (dst, oif, tos): 236/* Given (packet source, input interface) and optional (dst, oif, tos):
184 * - (main) check, that source is valid i.e. not broadcast or our local 237 * - (main) check, that source is valid i.e. not broadcast or our local
185 * address. 238 * address.
@@ -188,17 +241,15 @@ EXPORT_SYMBOL(inet_dev_addr_type);
188 * - check, that packet arrived from expected physical interface. 241 * - check, that packet arrived from expected physical interface.
189 * called with rcu_read_lock() 242 * called with rcu_read_lock()
190 */ 243 */
191int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, 244static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
192 int oif, struct net_device *dev, __be32 *spec_dst, 245 u8 tos, int oif, struct net_device *dev,
193 u32 *itag) 246 int rpf, struct in_device *idev, u32 *itag)
194{ 247{
195 struct in_device *in_dev; 248 int ret, no_addr, accept_local;
196 struct flowi4 fl4;
197 struct fib_result res; 249 struct fib_result res;
198 int no_addr, rpf, accept_local; 250 struct flowi4 fl4;
199 bool dev_match;
200 int ret;
201 struct net *net; 251 struct net *net;
252 bool dev_match;
202 253
203 fl4.flowi4_oif = 0; 254 fl4.flowi4_oif = 0;
204 fl4.flowi4_iif = oif; 255 fl4.flowi4_iif = oif;
@@ -207,20 +258,10 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
207 fl4.flowi4_tos = tos; 258 fl4.flowi4_tos = tos;
208 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 259 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
209 260
210 no_addr = rpf = accept_local = 0; 261 no_addr = idev->ifa_list == NULL;
211 in_dev = __in_dev_get_rcu(dev);
212 if (in_dev) {
213 no_addr = in_dev->ifa_list == NULL;
214
215 /* Ignore rp_filter for packets protected by IPsec. */
216 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
217
218 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
219 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
220 }
221 262
222 if (in_dev == NULL) 263 accept_local = IN_DEV_ACCEPT_LOCAL(idev);
223 goto e_inval; 264 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
224 265
225 net = dev_net(dev); 266 net = dev_net(dev);
226 if (fib_lookup(net, &fl4, &res)) 267 if (fib_lookup(net, &fl4, &res))
@@ -229,7 +270,6 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
229 if (res.type != RTN_LOCAL || !accept_local) 270 if (res.type != RTN_LOCAL || !accept_local)
230 goto e_inval; 271 goto e_inval;
231 } 272 }
232 *spec_dst = FIB_RES_PREFSRC(net, res);
233 fib_combine_itag(itag, &res); 273 fib_combine_itag(itag, &res);
234 dev_match = false; 274 dev_match = false;
235 275
@@ -258,17 +298,14 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
258 298
259 ret = 0; 299 ret = 0;
260 if (fib_lookup(net, &fl4, &res) == 0) { 300 if (fib_lookup(net, &fl4, &res) == 0) {
261 if (res.type == RTN_UNICAST) { 301 if (res.type == RTN_UNICAST)
262 *spec_dst = FIB_RES_PREFSRC(net, res);
263 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 302 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
264 }
265 } 303 }
266 return ret; 304 return ret;
267 305
268last_resort: 306last_resort:
269 if (rpf) 307 if (rpf)
270 goto e_rpf; 308 goto e_rpf;
271 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
272 *itag = 0; 309 *itag = 0;
273 return 0; 310 return 0;
274 311
@@ -278,6 +315,20 @@ e_rpf:
278 return -EXDEV; 315 return -EXDEV;
279} 316}
280 317
318/* Ignore rp_filter for packets protected by IPsec. */
319int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
320 u8 tos, int oif, struct net_device *dev,
321 struct in_device *idev, u32 *itag)
322{
323 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
324
325 if (!r && !fib_num_tclassid_users(dev_net(dev))) {
326 *itag = 0;
327 return 0;
328 }
329 return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
330}
331
281static inline __be32 sk_extract_addr(struct sockaddr *addr) 332static inline __be32 sk_extract_addr(struct sockaddr *addr)
282{ 333{
283 return ((struct sockaddr_in *) addr)->sin_addr.s_addr; 334 return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -879,10 +930,6 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
879 .flowi4_scope = frn->fl_scope, 930 .flowi4_scope = frn->fl_scope,
880 }; 931 };
881 932
882#ifdef CONFIG_IP_MULTIPLE_TABLES
883 res.r = NULL;
884#endif
885
886 frn->err = -ENOENT; 933 frn->err = -ENOENT;
887 if (tb) { 934 if (tb) {
888 local_bh_disable(); 935 local_bh_disable();
@@ -935,8 +982,11 @@ static void nl_fib_input(struct sk_buff *skb)
935static int __net_init nl_fib_lookup_init(struct net *net) 982static int __net_init nl_fib_lookup_init(struct net *net)
936{ 983{
937 struct sock *sk; 984 struct sock *sk;
938 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, 985 struct netlink_kernel_cfg cfg = {
939 nl_fib_input, NULL, THIS_MODULE); 986 .input = nl_fib_input,
987 };
988
989 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, THIS_MODULE, &cfg);
940 if (sk == NULL) 990 if (sk == NULL)
941 return -EAFNOSUPPORT; 991 return -EAFNOSUPPORT;
942 net->ipv4.fibnl = sk; 992 net->ipv4.fibnl = sk;
@@ -1021,11 +1071,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1021 rt_cache_flush(dev_net(dev), 0); 1071 rt_cache_flush(dev_net(dev), 0);
1022 break; 1072 break;
1023 case NETDEV_UNREGISTER_BATCH: 1073 case NETDEV_UNREGISTER_BATCH:
1024 /* The batch unregister is only called on the first
1025 * device in the list of devices being unregistered.
1026 * Therefore we should not pass dev_net(dev) in here.
1027 */
1028 rt_cache_flush_batch(NULL);
1029 break; 1074 break;
1030 } 1075 }
1031 return NOTIFY_DONE; 1076 return NOTIFY_DONE;
@@ -1090,6 +1135,9 @@ static int __net_init fib_net_init(struct net *net)
1090{ 1135{
1091 int error; 1136 int error;
1092 1137
1138#ifdef CONFIG_IP_ROUTE_CLASSID
1139 net->ipv4.fib_num_tclassid_users = 0;
1140#endif
1093 error = ip_fib_net_init(net); 1141 error = ip_fib_net_init(net);
1094 if (error < 0) 1142 if (error < 0)
1095 goto out; 1143 goto out;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2d043f71ef70..a83d74e498d2 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,14 +47,7 @@ struct fib4_rule {
47#endif 47#endif
48}; 48};
49 49
50#ifdef CONFIG_IP_ROUTE_CLASSID 50int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
51u32 fib_rules_tclass(const struct fib_result *res)
52{
53 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
54}
55#endif
56
57int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
58{ 51{
59 struct fib_lookup_arg arg = { 52 struct fib_lookup_arg arg = {
60 .result = res, 53 .result = res,
@@ -63,11 +56,15 @@ int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
63 int err; 56 int err;
64 57
65 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); 58 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
66 res->r = arg.rule; 59#ifdef CONFIG_IP_ROUTE_CLASSID
67 60 if (arg.rule)
61 res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
62 else
63 res->tclassid = 0;
64#endif
68 return err; 65 return err;
69} 66}
70EXPORT_SYMBOL_GPL(fib_lookup); 67EXPORT_SYMBOL_GPL(__fib_lookup);
71 68
72static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, 69static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
73 int flags, struct fib_lookup_arg *arg) 70 int flags, struct fib_lookup_arg *arg)
@@ -169,8 +166,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
169 rule4->dst = nla_get_be32(tb[FRA_DST]); 166 rule4->dst = nla_get_be32(tb[FRA_DST]);
170 167
171#ifdef CONFIG_IP_ROUTE_CLASSID 168#ifdef CONFIG_IP_ROUTE_CLASSID
172 if (tb[FRA_FLOW]) 169 if (tb[FRA_FLOW]) {
173 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171 if (rule4->tclassid)
172 net->ipv4.fib_num_tclassid_users++;
173 }
174#endif 174#endif
175 175
176 rule4->src_len = frh->src_len; 176 rule4->src_len = frh->src_len;
@@ -179,11 +179,24 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
179 rule4->dstmask = inet_make_mask(rule4->dst_len); 179 rule4->dstmask = inet_make_mask(rule4->dst_len);
180 rule4->tos = frh->tos; 180 rule4->tos = frh->tos;
181 181
182 net->ipv4.fib_has_custom_rules = true;
182 err = 0; 183 err = 0;
183errout: 184errout:
184 return err; 185 return err;
185} 186}
186 187
188static void fib4_rule_delete(struct fib_rule *rule)
189{
190 struct net *net = rule->fr_net;
191#ifdef CONFIG_IP_ROUTE_CLASSID
192 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
193
194 if (rule4->tclassid)
195 net->ipv4.fib_num_tclassid_users--;
196#endif
197 net->ipv4.fib_has_custom_rules = true;
198}
199
187static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, 200static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
188 struct nlattr **tb) 201 struct nlattr **tb)
189{ 202{
@@ -256,6 +269,7 @@ static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
256 .action = fib4_rule_action, 269 .action = fib4_rule_action,
257 .match = fib4_rule_match, 270 .match = fib4_rule_match,
258 .configure = fib4_rule_configure, 271 .configure = fib4_rule_configure,
272 .delete = fib4_rule_delete,
259 .compare = fib4_rule_compare, 273 .compare = fib4_rule_compare,
260 .fill = fib4_rule_fill, 274 .fill = fib4_rule_fill,
261 .default_pref = fib_default_rule_pref, 275 .default_pref = fib_default_rule_pref,
@@ -295,6 +309,7 @@ int __net_init fib4_rules_init(struct net *net)
295 if (err < 0) 309 if (err < 0)
296 goto fail; 310 goto fail;
297 net->ipv4.rules_ops = ops; 311 net->ipv4.rules_ops = ops;
312 net->ipv4.fib_has_custom_rules = false;
298 return 0; 313 return 0;
299 314
300fail: 315fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e5b7182fa099..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 140 },
141}; 141};
142 142
143static void free_nh_exceptions(struct fib_nh *nh)
144{
145 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
146 int i;
147
148 for (i = 0; i < FNHE_HASH_SIZE; i++) {
149 struct fib_nh_exception *fnhe;
150
151 fnhe = rcu_dereference_protected(hash[i].chain, 1);
152 while (fnhe) {
153 struct fib_nh_exception *next;
154
155 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
156 kfree(fnhe);
157
158 fnhe = next;
159 }
160 }
161 kfree(hash);
162}
163
143/* Release a nexthop info record */ 164/* Release a nexthop info record */
144static void free_fib_info_rcu(struct rcu_head *head) 165static void free_fib_info_rcu(struct rcu_head *head)
145{ 166{
@@ -148,6 +169,12 @@ static void free_fib_info_rcu(struct rcu_head *head)
148 change_nexthops(fi) { 169 change_nexthops(fi) {
149 if (nexthop_nh->nh_dev) 170 if (nexthop_nh->nh_dev)
150 dev_put(nexthop_nh->nh_dev); 171 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh);
174 if (nexthop_nh->nh_rth_output)
175 dst_release(&nexthop_nh->nh_rth_output->dst);
176 if (nexthop_nh->nh_rth_input)
177 dst_release(&nexthop_nh->nh_rth_input->dst);
151 } endfor_nexthops(fi); 178 } endfor_nexthops(fi);
152 179
153 release_net(fi->fib_net); 180 release_net(fi->fib_net);
@@ -163,6 +190,12 @@ void free_fib_info(struct fib_info *fi)
163 return; 190 return;
164 } 191 }
165 fib_info_cnt--; 192 fib_info_cnt--;
193#ifdef CONFIG_IP_ROUTE_CLASSID
194 change_nexthops(fi) {
195 if (nexthop_nh->nh_tclassid)
196 fi->fib_net->ipv4.fib_num_tclassid_users--;
197 } endfor_nexthops(fi);
198#endif
166 call_rcu(&fi->rcu, free_fib_info_rcu); 199 call_rcu(&fi->rcu, free_fib_info_rcu);
167} 200}
168 201
@@ -421,6 +454,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
421#ifdef CONFIG_IP_ROUTE_CLASSID 454#ifdef CONFIG_IP_ROUTE_CLASSID
422 nla = nla_find(attrs, attrlen, RTA_FLOW); 455 nla = nla_find(attrs, attrlen, RTA_FLOW);
423 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 456 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
457 if (nexthop_nh->nh_tclassid)
458 fi->fib_net->ipv4.fib_num_tclassid_users++;
424#endif 459#endif
425 } 460 }
426 461
@@ -779,9 +814,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 int type = nla_type(nla); 814 int type = nla_type(nla);
780 815
781 if (type) { 816 if (type) {
817 u32 val;
818
782 if (type > RTAX_MAX) 819 if (type > RTAX_MAX)
783 goto err_inval; 820 goto err_inval;
784 fi->fib_metrics[type - 1] = nla_get_u32(nla); 821 val = nla_get_u32(nla);
822 if (type == RTAX_ADVMSS && val > 65535 - 40)
823 val = 65535 - 40;
824 if (type == RTAX_MTU && val > 65535 - 15)
825 val = 65535 - 15;
826 fi->fib_metrics[type - 1] = val;
785 } 827 }
786 } 828 }
787 } 829 }
@@ -810,6 +852,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
810 nh->nh_flags = cfg->fc_flags; 852 nh->nh_flags = cfg->fc_flags;
811#ifdef CONFIG_IP_ROUTE_CLASSID 853#ifdef CONFIG_IP_ROUTE_CLASSID
812 nh->nh_tclassid = cfg->fc_flow; 854 nh->nh_tclassid = cfg->fc_flow;
855 if (nh->nh_tclassid)
856 fi->fib_net->ipv4.fib_num_tclassid_users++;
813#endif 857#endif
814#ifdef CONFIG_IP_ROUTE_MULTIPATH 858#ifdef CONFIG_IP_ROUTE_MULTIPATH
815 nh->nh_weight = 1; 859 nh->nh_weight = 1;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 30b88d7b4bd6..18cbc15b20d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1007,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { 1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1010 tn = (struct tnode *) resize(t, (struct tnode *)tn); 1010 tn = (struct tnode *)resize(t, tn);
1011 1011
1012 tnode_put_child_reorg((struct tnode *)tp, cindex, 1012 tnode_put_child_reorg(tp, cindex,
1013 (struct rt_trie_node *)tn, wasfull); 1013 (struct rt_trie_node *)tn, wasfull);
1014 1014
1015 tp = node_parent((struct rt_trie_node *) tn); 1015 tp = node_parent((struct rt_trie_node *) tn);
@@ -1024,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1024 1024
1025 /* Handle last (top) tnode */ 1025 /* Handle last (top) tnode */
1026 if (IS_TNODE(tn)) 1026 if (IS_TNODE(tn))
1027 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1027 tn = (struct tnode *)resize(t, tn);
1028 1028
1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1030 tnode_free_flush(); 1030 tnode_free_flush();
@@ -1125,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1125 node_set_parent((struct rt_trie_node *)l, tp); 1125 node_set_parent((struct rt_trie_node *)l, tp);
1126 1126
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); 1128 put_child(t, tp, cindex, (struct rt_trie_node *)l);
1129 } else { 1129 } else {
1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1131 /* 1131 /*
@@ -1160,8 +1160,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1160 1160
1161 if (tp) { 1161 if (tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, (struct tnode *)tp, cindex, 1163 put_child(t, tp, cindex, (struct rt_trie_node *)tn);
1164 (struct rt_trie_node *)tn);
1165 } else { 1164 } else {
1166 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1165 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1167 tp = tn; 1166 tp = tn;
@@ -1620,7 +1619,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1620 1619
1621 if (tp) { 1620 if (tp) {
1622 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1623 put_child(t, (struct tnode *)tp, cindex, NULL); 1622 put_child(t, tp, cindex, NULL);
1624 trie_rebalance(t, tp); 1623 trie_rebalance(t, tp);
1625 } else 1624 } else
1626 RCU_INIT_POINTER(t->trie, NULL); 1625 RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c75efbdc71cb..f2eccd531746 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -95,6 +95,7 @@
95#include <net/checksum.h> 95#include <net/checksum.h>
96#include <net/xfrm.h> 96#include <net/xfrm.h>
97#include <net/inet_common.h> 97#include <net/inet_common.h>
98#include <net/ip_fib.h>
98 99
99/* 100/*
100 * Build xmit assembly blocks 101 * Build xmit assembly blocks
@@ -253,10 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
253 254
254 /* Limit if icmp type is enabled in ratemask. */ 255 /* Limit if icmp type is enabled in ratemask. */
255 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
256 if (!rt->peer) 257 struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
257 rt_bind_peer(rt, fl4->daddr, 1); 258 rc = inet_peer_xrlim_allow(peer,
258 rc = inet_peer_xrlim_allow(rt->peer,
259 net->ipv4.sysctl_icmp_ratelimit); 259 net->ipv4.sysctl_icmp_ratelimit);
260 inet_putpeer(peer);
260 } 261 }
261out: 262out:
262 return rc; 263 return rc;
@@ -334,7 +335,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
334 struct flowi4 fl4; 335 struct flowi4 fl4;
335 struct sock *sk; 336 struct sock *sk;
336 struct inet_sock *inet; 337 struct inet_sock *inet;
337 __be32 daddr; 338 __be32 daddr, saddr;
338 339
339 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 340 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
340 return; 341 return;
@@ -348,6 +349,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
348 349
349 inet->tos = ip_hdr(skb)->tos; 350 inet->tos = ip_hdr(skb)->tos;
350 daddr = ipc.addr = ip_hdr(skb)->saddr; 351 daddr = ipc.addr = ip_hdr(skb)->saddr;
352 saddr = fib_compute_spec_dst(skb);
351 ipc.opt = NULL; 353 ipc.opt = NULL;
352 ipc.tx_flags = 0; 354 ipc.tx_flags = 0;
353 if (icmp_param->replyopts.opt.opt.optlen) { 355 if (icmp_param->replyopts.opt.opt.optlen) {
@@ -357,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
357 } 359 }
358 memset(&fl4, 0, sizeof(fl4)); 360 memset(&fl4, 0, sizeof(fl4));
359 fl4.daddr = daddr; 361 fl4.daddr = daddr;
360 fl4.saddr = rt->rt_spec_dst; 362 fl4.saddr = saddr;
361 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 363 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
362 fl4.flowi4_proto = IPPROTO_ICMP; 364 fl4.flowi4_proto = IPPROTO_ICMP;
363 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 365 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -569,7 +571,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
569 rcu_read_lock(); 571 rcu_read_lock();
570 if (rt_is_input_route(rt) && 572 if (rt_is_input_route(rt) &&
571 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 573 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
572 dev = dev_get_by_index_rcu(net, rt->rt_iif); 574 dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
573 575
574 if (dev) 576 if (dev)
575 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 577 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -632,6 +634,27 @@ out:;
632EXPORT_SYMBOL(icmp_send); 634EXPORT_SYMBOL(icmp_send);
633 635
634 636
637static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
638{
639 const struct iphdr *iph = (const struct iphdr *) skb->data;
640 const struct net_protocol *ipprot;
641 int protocol = iph->protocol;
642
643 /* Checkin full IP header plus 8 bytes of protocol to
644 * avoid additional coding at protocol handlers.
645 */
646 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
647 return;
648
649 raw_icmp_error(skb, protocol, info);
650
651 rcu_read_lock();
652 ipprot = rcu_dereference(inet_protos[protocol]);
653 if (ipprot && ipprot->err_handler)
654 ipprot->err_handler(skb, info);
655 rcu_read_unlock();
656}
657
635/* 658/*
636 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. 659 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
637 */ 660 */
@@ -640,10 +663,8 @@ static void icmp_unreach(struct sk_buff *skb)
640{ 663{
641 const struct iphdr *iph; 664 const struct iphdr *iph;
642 struct icmphdr *icmph; 665 struct icmphdr *icmph;
643 int hash, protocol;
644 const struct net_protocol *ipprot;
645 u32 info = 0;
646 struct net *net; 666 struct net *net;
667 u32 info = 0;
647 668
648 net = dev_net(skb_dst(skb)->dev); 669 net = dev_net(skb_dst(skb)->dev);
649 670
@@ -674,9 +695,7 @@ static void icmp_unreach(struct sk_buff *skb)
674 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), 695 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
675 &iph->daddr); 696 &iph->daddr);
676 } else { 697 } else {
677 info = ip_rt_frag_needed(net, iph, 698 info = ntohs(icmph->un.frag.mtu);
678 ntohs(icmph->un.frag.mtu),
679 skb->dev);
680 if (!info) 699 if (!info)
681 goto out; 700 goto out;
682 } 701 }
@@ -720,26 +739,7 @@ static void icmp_unreach(struct sk_buff *skb)
720 goto out; 739 goto out;
721 } 740 }
722 741
723 /* Checkin full IP header plus 8 bytes of protocol to 742 icmp_socket_deliver(skb, info);
724 * avoid additional coding at protocol handlers.
725 */
726 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
727 goto out;
728
729 iph = (const struct iphdr *)skb->data;
730 protocol = iph->protocol;
731
732 /*
733 * Deliver ICMP message to raw sockets. Pretty useless feature?
734 */
735 raw_icmp_error(skb, protocol, info);
736
737 hash = protocol & (MAX_INET_PROTOS - 1);
738 rcu_read_lock();
739 ipprot = rcu_dereference(inet_protos[hash]);
740 if (ipprot && ipprot->err_handler)
741 ipprot->err_handler(skb, info);
742 rcu_read_unlock();
743 743
744out: 744out:
745 return; 745 return;
@@ -755,46 +755,15 @@ out_err:
755 755
756static void icmp_redirect(struct sk_buff *skb) 756static void icmp_redirect(struct sk_buff *skb)
757{ 757{
758 const struct iphdr *iph; 758 if (skb->len < sizeof(struct iphdr)) {
759 759 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
760 if (skb->len < sizeof(struct iphdr)) 760 return;
761 goto out_err;
762
763 /*
764 * Get the copied header of the packet that caused the redirect
765 */
766 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
767 goto out;
768
769 iph = (const struct iphdr *)skb->data;
770
771 switch (icmp_hdr(skb)->code & 7) {
772 case ICMP_REDIR_NET:
773 case ICMP_REDIR_NETTOS:
774 /*
775 * As per RFC recommendations now handle it as a host redirect.
776 */
777 case ICMP_REDIR_HOST:
778 case ICMP_REDIR_HOSTTOS:
779 ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
780 icmp_hdr(skb)->un.gateway,
781 iph->saddr, skb->dev);
782 break;
783 } 761 }
784 762
785 /* Ping wants to see redirects. 763 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
786 * Let's pretend they are errors of sorts... */ 764 return;
787 if (iph->protocol == IPPROTO_ICMP &&
788 iph->ihl >= 5 &&
789 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
790 ping_err(skb, icmp_hdr(skb)->un.gateway);
791 }
792 765
793out: 766 icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
794 return;
795out_err:
796 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
797 goto out;
798} 767}
799 768
800/* 769/*
@@ -868,86 +837,6 @@ out_err:
868 goto out; 837 goto out;
869} 838}
870 839
871
872/*
873 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
874 *
875 * RFC1122 (3.2.2.9). A host MUST only send replies to
876 * ADDRESS_MASK requests if it's been configured as an address mask
877 * agent. Receiving a request doesn't constitute implicit permission to
878 * act as one. Of course, implementing this correctly requires (SHOULD)
879 * a way to turn the functionality on and off. Another one for sysctl(),
880 * I guess. -- MS
881 *
882 * RFC1812 (4.3.3.9). A router MUST implement it.
883 * A router SHOULD have switch turning it on/off.
884 * This switch MUST be ON by default.
885 *
886 * Gratuitous replies, zero-source replies are not implemented,
887 * that complies with RFC. DO NOT implement them!!! All the idea
888 * of broadcast addrmask replies as specified in RFC950 is broken.
889 * The problem is that it is not uncommon to have several prefixes
890 * on one physical interface. Moreover, addrmask agent can even be
891 * not aware of existing another prefixes.
892 * If source is zero, addrmask agent cannot choose correct prefix.
893 * Gratuitous mask announcements suffer from the same problem.
894 * RFC1812 explains it, but still allows to use ADDRMASK,
895 * that is pretty silly. --ANK
896 *
897 * All these rules are so bizarre, that I removed kernel addrmask
898 * support at all. It is wrong, it is obsolete, nobody uses it in
899 * any case. --ANK
900 *
901 * Furthermore you can do it with a usermode address agent program
902 * anyway...
903 */
904
905static void icmp_address(struct sk_buff *skb)
906{
907#if 0
908 net_dbg_ratelimited("a guy asks for address mask. Who is it?\n");
909#endif
910}
911
912/*
913 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
914 * loudly if an inconsistency is found.
915 * called with rcu_read_lock()
916 */
917
918static void icmp_address_reply(struct sk_buff *skb)
919{
920 struct rtable *rt = skb_rtable(skb);
921 struct net_device *dev = skb->dev;
922 struct in_device *in_dev;
923 struct in_ifaddr *ifa;
924
925 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
926 return;
927
928 in_dev = __in_dev_get_rcu(dev);
929 if (!in_dev)
930 return;
931
932 if (in_dev->ifa_list &&
933 IN_DEV_LOG_MARTIANS(in_dev) &&
934 IN_DEV_FORWARD(in_dev)) {
935 __be32 _mask, *mp;
936
937 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
938 BUG_ON(mp == NULL);
939 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
940 if (*mp == ifa->ifa_mask &&
941 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
942 break;
943 }
944 if (!ifa)
945 net_info_ratelimited("Wrong address mask %pI4 from %s/%pI4\n",
946 mp,
947 dev->name, &ip_hdr(skb)->saddr);
948 }
949}
950
951static void icmp_discard(struct sk_buff *skb) 840static void icmp_discard(struct sk_buff *skb)
952{ 841{
953} 842}
@@ -1111,10 +1000,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1111 .handler = icmp_discard, 1000 .handler = icmp_discard,
1112 }, 1001 },
1113 [ICMP_ADDRESS] = { 1002 [ICMP_ADDRESS] = {
1114 .handler = icmp_address, 1003 .handler = icmp_discard,
1115 }, 1004 },
1116 [ICMP_ADDRESSREPLY] = { 1005 [ICMP_ADDRESSREPLY] = {
1117 .handler = icmp_address_reply, 1006 .handler = icmp_discard,
1118 }, 1007 },
1119}; 1008};
1120 1009
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f9ee7417f6a0..db0cf17c00f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -374,18 +374,19 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
374 const struct inet_request_sock *ireq = inet_rsk(req); 374 const struct inet_request_sock *ireq = inet_rsk(req);
375 struct ip_options_rcu *opt = inet_rsk(req)->opt; 375 struct ip_options_rcu *opt = inet_rsk(req)->opt;
376 struct net *net = sock_net(sk); 376 struct net *net = sock_net(sk);
377 int flags = inet_sk_flowi_flags(sk);
377 378
378 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 379 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
379 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 380 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
380 sk->sk_protocol, 381 sk->sk_protocol,
381 inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS, 382 flags,
382 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, 383 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
383 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); 384 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
384 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 385 security_req_classify_flow(req, flowi4_to_flowi(fl4));
385 rt = ip_route_output_flow(net, fl4, sk); 386 rt = ip_route_output_flow(net, fl4, sk);
386 if (IS_ERR(rt)) 387 if (IS_ERR(rt))
387 goto no_route; 388 goto no_route;
388 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 389 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
389 goto route_err; 390 goto route_err;
390 return &rt->dst; 391 return &rt->dst;
391 392
@@ -418,7 +419,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
418 rt = ip_route_output_flow(net, fl4, sk); 419 rt = ip_route_output_flow(net, fl4, sk);
419 if (IS_ERR(rt)) 420 if (IS_ERR(rt))
420 goto no_route; 421 goto no_route;
421 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 422 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
422 goto route_err; 423 goto route_err;
423 return &rt->dst; 424 return &rt->dst;
424 425
@@ -799,3 +800,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
799} 800}
800EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 801EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
801#endif 802#endif
803
804static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
805{
806 const struct inet_sock *inet = inet_sk(sk);
807 const struct ip_options_rcu *inet_opt;
808 __be32 daddr = inet->inet_daddr;
809 struct flowi4 *fl4;
810 struct rtable *rt;
811
812 rcu_read_lock();
813 inet_opt = rcu_dereference(inet->inet_opt);
814 if (inet_opt && inet_opt->opt.srr)
815 daddr = inet_opt->opt.faddr;
816 fl4 = &fl->u.ip4;
817 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
818 inet->inet_saddr, inet->inet_dport,
819 inet->inet_sport, sk->sk_protocol,
820 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
821 if (IS_ERR(rt))
822 rt = NULL;
823 if (rt)
824 sk_setup_caps(sk, &rt->dst);
825 rcu_read_unlock();
826
827 return &rt->dst;
828}
829
830struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
831{
832 struct dst_entry *dst = __sk_dst_check(sk, 0);
833 struct inet_sock *inet = inet_sk(sk);
834
835 if (!dst) {
836 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
837 if (!dst)
838 goto out;
839 }
840 dst->ops->update_pmtu(dst, sk, NULL, mtu);
841
842 dst = __sk_dst_check(sk, 0);
843 if (!dst)
844 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
845out:
846 return dst;
847}
848EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 46d1e7199a8c..570e61f9611f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -46,9 +46,6 @@ struct inet_diag_entry {
46 u16 userlocks; 46 u16 userlocks;
47}; 47};
48 48
49#define INET_DIAG_PUT(skb, attrtype, attrlen) \
50 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
51
52static DEFINE_MUTEX(inet_diag_table_mutex); 49static DEFINE_MUTEX(inet_diag_table_mutex);
53 50
54static const struct inet_diag_handler *inet_diag_lock_handler(int proto) 51static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
@@ -78,24 +75,22 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
78 const struct inet_sock *inet = inet_sk(sk); 75 const struct inet_sock *inet = inet_sk(sk);
79 struct inet_diag_msg *r; 76 struct inet_diag_msg *r;
80 struct nlmsghdr *nlh; 77 struct nlmsghdr *nlh;
78 struct nlattr *attr;
81 void *info = NULL; 79 void *info = NULL;
82 struct inet_diag_meminfo *minfo = NULL;
83 unsigned char *b = skb_tail_pointer(skb);
84 const struct inet_diag_handler *handler; 80 const struct inet_diag_handler *handler;
85 int ext = req->idiag_ext; 81 int ext = req->idiag_ext;
86 82
87 handler = inet_diag_table[req->sdiag_protocol]; 83 handler = inet_diag_table[req->sdiag_protocol];
88 BUG_ON(handler == NULL); 84 BUG_ON(handler == NULL);
89 85
90 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); 86 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
91 nlh->nlmsg_flags = nlmsg_flags; 87 nlmsg_flags);
88 if (!nlh)
89 return -EMSGSIZE;
92 90
93 r = NLMSG_DATA(nlh); 91 r = nlmsg_data(nlh);
94 BUG_ON(sk->sk_state == TCP_TIME_WAIT); 92 BUG_ON(sk->sk_state == TCP_TIME_WAIT);
95 93
96 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
97 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
98
99 r->idiag_family = sk->sk_family; 94 r->idiag_family = sk->sk_family;
100 r->idiag_state = sk->sk_state; 95 r->idiag_state = sk->sk_state;
101 r->idiag_timer = 0; 96 r->idiag_timer = 0;
@@ -113,7 +108,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
113 * hence this needs to be included regardless of socket family. 108 * hence this needs to be included regardless of socket family.
114 */ 109 */
115 if (ext & (1 << (INET_DIAG_TOS - 1))) 110 if (ext & (1 << (INET_DIAG_TOS - 1)))
116 RTA_PUT_U8(skb, INET_DIAG_TOS, inet->tos); 111 if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
112 goto errout;
117 113
118#if IS_ENABLED(CONFIG_IPV6) 114#if IS_ENABLED(CONFIG_IPV6)
119 if (r->idiag_family == AF_INET6) { 115 if (r->idiag_family == AF_INET6) {
@@ -121,24 +117,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
121 117
122 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; 118 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr;
123 *(struct in6_addr *)r->id.idiag_dst = np->daddr; 119 *(struct in6_addr *)r->id.idiag_dst = np->daddr;
120
124 if (ext & (1 << (INET_DIAG_TCLASS - 1))) 121 if (ext & (1 << (INET_DIAG_TCLASS - 1)))
125 RTA_PUT_U8(skb, INET_DIAG_TCLASS, np->tclass); 122 if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
123 goto errout;
126 } 124 }
127#endif 125#endif
128 126
129 r->idiag_uid = sock_i_uid(sk); 127 r->idiag_uid = sock_i_uid(sk);
130 r->idiag_inode = sock_i_ino(sk); 128 r->idiag_inode = sock_i_ino(sk);
131 129
132 if (minfo) { 130 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
133 minfo->idiag_rmem = sk_rmem_alloc_get(sk); 131 struct inet_diag_meminfo minfo = {
134 minfo->idiag_wmem = sk->sk_wmem_queued; 132 .idiag_rmem = sk_rmem_alloc_get(sk),
135 minfo->idiag_fmem = sk->sk_forward_alloc; 133 .idiag_wmem = sk->sk_wmem_queued,
136 minfo->idiag_tmem = sk_wmem_alloc_get(sk); 134 .idiag_fmem = sk->sk_forward_alloc,
135 .idiag_tmem = sk_wmem_alloc_get(sk),
136 };
137
138 if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
139 goto errout;
137 } 140 }
138 141
139 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) 142 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
140 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) 143 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
141 goto rtattr_failure; 144 goto errout;
142 145
143 if (icsk == NULL) { 146 if (icsk == NULL) {
144 handler->idiag_get_info(sk, r, NULL); 147 handler->idiag_get_info(sk, r, NULL);
@@ -165,16 +168,20 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
165 } 168 }
166#undef EXPIRES_IN_MS 169#undef EXPIRES_IN_MS
167 170
168 if (ext & (1 << (INET_DIAG_INFO - 1))) 171 if (ext & (1 << (INET_DIAG_INFO - 1))) {
169 info = INET_DIAG_PUT(skb, INET_DIAG_INFO, sizeof(struct tcp_info)); 172 attr = nla_reserve(skb, INET_DIAG_INFO,
170 173 sizeof(struct tcp_info));
171 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { 174 if (!attr)
172 const size_t len = strlen(icsk->icsk_ca_ops->name); 175 goto errout;
173 176
174 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), 177 info = nla_data(attr);
175 icsk->icsk_ca_ops->name);
176 } 178 }
177 179
180 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
181 if (nla_put_string(skb, INET_DIAG_CONG,
182 icsk->icsk_ca_ops->name) < 0)
183 goto errout;
184
178 handler->idiag_get_info(sk, r, info); 185 handler->idiag_get_info(sk, r, info);
179 186
180 if (sk->sk_state < TCP_TIME_WAIT && 187 if (sk->sk_state < TCP_TIME_WAIT &&
@@ -182,12 +189,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
182 icsk->icsk_ca_ops->get_info(sk, ext, skb); 189 icsk->icsk_ca_ops->get_info(sk, ext, skb);
183 190
184out: 191out:
185 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 192 return nlmsg_end(skb, nlh);
186 return skb->len;
187 193
188rtattr_failure: 194errout:
189nlmsg_failure: 195 nlmsg_cancel(skb, nlh);
190 nlmsg_trim(skb, b);
191 return -EMSGSIZE; 196 return -EMSGSIZE;
192} 197}
193EXPORT_SYMBOL_GPL(inet_sk_diag_fill); 198EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
@@ -208,14 +213,15 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
208{ 213{
209 long tmo; 214 long tmo;
210 struct inet_diag_msg *r; 215 struct inet_diag_msg *r;
211 const unsigned char *previous_tail = skb_tail_pointer(skb); 216 struct nlmsghdr *nlh;
212 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
213 unlh->nlmsg_type, sizeof(*r));
214 217
215 r = NLMSG_DATA(nlh); 218 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
216 BUG_ON(tw->tw_state != TCP_TIME_WAIT); 219 nlmsg_flags);
220 if (!nlh)
221 return -EMSGSIZE;
217 222
218 nlh->nlmsg_flags = nlmsg_flags; 223 r = nlmsg_data(nlh);
224 BUG_ON(tw->tw_state != TCP_TIME_WAIT);
219 225
220 tmo = tw->tw_ttd - jiffies; 226 tmo = tw->tw_ttd - jiffies;
221 if (tmo < 0) 227 if (tmo < 0)
@@ -245,11 +251,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
245 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; 251 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr;
246 } 252 }
247#endif 253#endif
248 nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail; 254
249 return skb->len; 255 return nlmsg_end(skb, nlh);
250nlmsg_failure:
251 nlmsg_trim(skb, previous_tail);
252 return -EMSGSIZE;
253} 256}
254 257
255static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 258static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -269,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
269 int err; 272 int err;
270 struct sock *sk; 273 struct sock *sk;
271 struct sk_buff *rep; 274 struct sk_buff *rep;
275 struct net *net = sock_net(in_skb->sk);
272 276
273 err = -EINVAL; 277 err = -EINVAL;
274 if (req->sdiag_family == AF_INET) { 278 if (req->sdiag_family == AF_INET) {
275 sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0], 279 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
276 req->id.idiag_dport, req->id.idiag_src[0], 280 req->id.idiag_dport, req->id.idiag_src[0],
277 req->id.idiag_sport, req->id.idiag_if); 281 req->id.idiag_sport, req->id.idiag_if);
278 } 282 }
279#if IS_ENABLED(CONFIG_IPV6) 283#if IS_ENABLED(CONFIG_IPV6)
280 else if (req->sdiag_family == AF_INET6) { 284 else if (req->sdiag_family == AF_INET6) {
281 sk = inet6_lookup(&init_net, hashinfo, 285 sk = inet6_lookup(net, hashinfo,
282 (struct in6_addr *)req->id.idiag_dst, 286 (struct in6_addr *)req->id.idiag_dst,
283 req->id.idiag_dport, 287 req->id.idiag_dport,
284 (struct in6_addr *)req->id.idiag_src, 288 (struct in6_addr *)req->id.idiag_src,
@@ -298,23 +302,23 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
298 if (err) 302 if (err)
299 goto out; 303 goto out;
300 304
301 err = -ENOMEM; 305 rep = nlmsg_new(sizeof(struct inet_diag_msg) +
302 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + 306 sizeof(struct inet_diag_meminfo) +
303 sizeof(struct inet_diag_meminfo) + 307 sizeof(struct tcp_info) + 64, GFP_KERNEL);
304 sizeof(struct tcp_info) + 64)), 308 if (!rep) {
305 GFP_KERNEL); 309 err = -ENOMEM;
306 if (!rep)
307 goto out; 310 goto out;
311 }
308 312
309 err = sk_diag_fill(sk, rep, req, 313 err = sk_diag_fill(sk, rep, req,
310 NETLINK_CB(in_skb).pid, 314 NETLINK_CB(in_skb).pid,
311 nlh->nlmsg_seq, 0, nlh); 315 nlh->nlmsg_seq, 0, nlh);
312 if (err < 0) { 316 if (err < 0) {
313 WARN_ON(err == -EMSGSIZE); 317 WARN_ON(err == -EMSGSIZE);
314 kfree_skb(rep); 318 nlmsg_free(rep);
315 goto out; 319 goto out;
316 } 320 }
317 err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, 321 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
318 MSG_DONTWAIT); 322 MSG_DONTWAIT);
319 if (err > 0) 323 if (err > 0)
320 err = 0; 324 err = 0;
@@ -592,15 +596,16 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
592{ 596{
593 const struct inet_request_sock *ireq = inet_rsk(req); 597 const struct inet_request_sock *ireq = inet_rsk(req);
594 struct inet_sock *inet = inet_sk(sk); 598 struct inet_sock *inet = inet_sk(sk);
595 unsigned char *b = skb_tail_pointer(skb);
596 struct inet_diag_msg *r; 599 struct inet_diag_msg *r;
597 struct nlmsghdr *nlh; 600 struct nlmsghdr *nlh;
598 long tmo; 601 long tmo;
599 602
600 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); 603 nlh = nlmsg_put(skb, pid, seq, unlh->nlmsg_type, sizeof(*r),
601 nlh->nlmsg_flags = NLM_F_MULTI; 604 NLM_F_MULTI);
602 r = NLMSG_DATA(nlh); 605 if (!nlh)
606 return -EMSGSIZE;
603 607
608 r = nlmsg_data(nlh);
604 r->idiag_family = sk->sk_family; 609 r->idiag_family = sk->sk_family;
605 r->idiag_state = TCP_SYN_RECV; 610 r->idiag_state = TCP_SYN_RECV;
606 r->idiag_timer = 1; 611 r->idiag_timer = 1;
@@ -628,13 +633,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
628 *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr; 633 *(struct in6_addr *)r->id.idiag_dst = inet6_rsk(req)->rmt_addr;
629 } 634 }
630#endif 635#endif
631 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
632
633 return skb->len;
634 636
635nlmsg_failure: 637 return nlmsg_end(skb, nlh);
636 nlmsg_trim(skb, b);
637 return -1;
638} 638}
639 639
640static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 640static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
@@ -725,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
725{ 725{
726 int i, num; 726 int i, num;
727 int s_i, s_num; 727 int s_i, s_num;
728 struct net *net = sock_net(skb->sk);
728 729
729 s_i = cb->args[1]; 730 s_i = cb->args[1];
730 s_num = num = cb->args[2]; 731 s_num = num = cb->args[2];
@@ -744,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
744 sk_nulls_for_each(sk, node, &ilb->head) { 745 sk_nulls_for_each(sk, node, &ilb->head) {
745 struct inet_sock *inet = inet_sk(sk); 746 struct inet_sock *inet = inet_sk(sk);
746 747
748 if (!net_eq(sock_net(sk), net))
749 continue;
750
747 if (num < s_num) { 751 if (num < s_num) {
748 num++; 752 num++;
749 continue; 753 continue;
@@ -814,6 +818,8 @@ skip_listen_ht:
814 sk_nulls_for_each(sk, node, &head->chain) { 818 sk_nulls_for_each(sk, node, &head->chain) {
815 struct inet_sock *inet = inet_sk(sk); 819 struct inet_sock *inet = inet_sk(sk);
816 820
821 if (!net_eq(sock_net(sk), net))
822 continue;
817 if (num < s_num) 823 if (num < s_num)
818 goto next_normal; 824 goto next_normal;
819 if (!(r->idiag_states & (1 << sk->sk_state))) 825 if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -840,6 +846,8 @@ next_normal:
840 846
841 inet_twsk_for_each(tw, node, 847 inet_twsk_for_each(tw, node,
842 &head->twchain) { 848 &head->twchain) {
849 if (!net_eq(twsk_net(tw), net))
850 continue;
843 851
844 if (num < s_num) 852 if (num < s_num)
845 goto next_dying; 853 goto next_dying;
@@ -892,7 +900,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
892 if (nlmsg_attrlen(cb->nlh, hdrlen)) 900 if (nlmsg_attrlen(cb->nlh, hdrlen))
893 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); 901 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
894 902
895 return __inet_diag_dump(skb, cb, (struct inet_diag_req_v2 *)NLMSG_DATA(cb->nlh), bc); 903 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
896} 904}
897 905
898static inline int inet_diag_type2proto(int type) 906static inline int inet_diag_type2proto(int type)
@@ -909,7 +917,7 @@ static inline int inet_diag_type2proto(int type)
909 917
910static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) 918static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
911{ 919{
912 struct inet_diag_req *rc = NLMSG_DATA(cb->nlh); 920 struct inet_diag_req *rc = nlmsg_data(cb->nlh);
913 struct inet_diag_req_v2 req; 921 struct inet_diag_req_v2 req;
914 struct nlattr *bc = NULL; 922 struct nlattr *bc = NULL;
915 int hdrlen = sizeof(struct inet_diag_req); 923 int hdrlen = sizeof(struct inet_diag_req);
@@ -929,7 +937,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
929static int inet_diag_get_exact_compat(struct sk_buff *in_skb, 937static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
930 const struct nlmsghdr *nlh) 938 const struct nlmsghdr *nlh)
931{ 939{
932 struct inet_diag_req *rc = NLMSG_DATA(nlh); 940 struct inet_diag_req *rc = nlmsg_data(nlh);
933 struct inet_diag_req_v2 req; 941 struct inet_diag_req_v2 req;
934 942
935 req.sdiag_family = rc->idiag_family; 943 req.sdiag_family = rc->idiag_family;
@@ -944,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
944static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) 952static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
945{ 953{
946 int hdrlen = sizeof(struct inet_diag_req); 954 int hdrlen = sizeof(struct inet_diag_req);
955 struct net *net = sock_net(skb->sk);
947 956
948 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || 957 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
949 nlmsg_len(nlh) < hdrlen) 958 nlmsg_len(nlh) < hdrlen)
@@ -964,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
964 struct netlink_dump_control c = { 973 struct netlink_dump_control c = {
965 .dump = inet_diag_dump_compat, 974 .dump = inet_diag_dump_compat,
966 }; 975 };
967 return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c); 976 return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
968 } 977 }
969 } 978 }
970 979
@@ -974,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
974static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) 983static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
975{ 984{
976 int hdrlen = sizeof(struct inet_diag_req_v2); 985 int hdrlen = sizeof(struct inet_diag_req_v2);
986 struct net *net = sock_net(skb->sk);
977 987
978 if (nlmsg_len(h) < hdrlen) 988 if (nlmsg_len(h) < hdrlen)
979 return -EINVAL; 989 return -EINVAL;
@@ -992,11 +1002,11 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
992 struct netlink_dump_control c = { 1002 struct netlink_dump_control c = {
993 .dump = inet_diag_dump, 1003 .dump = inet_diag_dump,
994 }; 1004 };
995 return netlink_dump_start(sock_diag_nlsk, skb, h, &c); 1005 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
996 } 1006 }
997 } 1007 }
998 1008
999 return inet_diag_get_exact(skb, h, (struct inet_diag_req_v2 *)NLMSG_DATA(h)); 1009 return inet_diag_get_exact(skb, h, nlmsg_data(h));
1000} 1010}
1001 1011
1002static const struct sock_diag_handler inet_diag_handler = { 1012static const struct sock_diag_handler inet_diag_handler = {
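
The inet_diag.c changes above are a mechanical conversion from the old overflow-goto macros (NLMSG_PUT, RTA_PUT_U8, INET_DIAG_PUT) to the nlmsg_*/nla_* helpers, which report overflow through return values and let the caller cancel a half-built message. The resulting fill pattern recurs throughout the file; a minimal sketch of it follows (the message type, attribute id and payload struct are placeholders, not part of this patch):

#include <net/netlink.h>

#define DEMO_MSG_TYPE	0x10	/* placeholder; must be >= NLMSG_MIN_TYPE */
#define DEMO_ATTR_FLAG	1	/* placeholder attribute id */

struct demo_msg {
	__u32	demo_field;
};

static int demo_fill(struct sk_buff *skb, u32 pid, u32 seq, int flags)
{
	struct nlmsghdr *nlh;
	struct demo_msg *r;

	nlh = nlmsg_put(skb, pid, seq, DEMO_MSG_TYPE, sizeof(*r), flags);
	if (!nlh)			/* header did not fit */
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->demo_field = 0;

	if (nla_put_u8(skb, DEMO_ATTR_FLAG, 1) < 0)
		goto errout;		/* attribute did not fit */

	return nlmsg_end(skb, nlh);	/* finalizes nlmsg_len */

errout:
	nlmsg_cancel(skb, nlh);		/* trims everything added since nlmsg_put() */
	return -EMSGSIZE;
}
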
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5ff2a51b6d0c..85190e69297b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -243,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
243 if (q == NULL) 243 if (q == NULL)
244 return NULL; 244 return NULL;
245 245
246 q->net = nf;
246 f->constructor(q, arg); 247 f->constructor(q, arg);
247 atomic_add(f->qsize, &nf->mem); 248 atomic_add(f->qsize, &nf->mem);
248 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 249 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
249 spin_lock_init(&q->lock); 250 spin_lock_init(&q->lock);
250 atomic_set(&q->refcnt, 1); 251 atomic_set(&q->refcnt, 1);
251 q->net = nf;
252 252
253 return q; 253 return q;
254} 254}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index dfba343b2509..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -82,23 +82,39 @@ static const struct inet_peer peer_fake_node = {
82 .avl_height = 0 82 .avl_height = 0
83}; 83};
84 84
85struct inet_peer_base { 85void inet_peer_base_init(struct inet_peer_base *bp)
86 struct inet_peer __rcu *root; 86{
87 seqlock_t lock; 87 bp->root = peer_avl_empty_rcu;
88 int total; 88 seqlock_init(&bp->lock);
89}; 89 bp->flush_seq = ~0U;
90 bp->total = 0;
91}
92EXPORT_SYMBOL_GPL(inet_peer_base_init);
90 93
91static struct inet_peer_base v4_peers = { 94static atomic_t v4_seq = ATOMIC_INIT(0);
92 .root = peer_avl_empty_rcu, 95static atomic_t v6_seq = ATOMIC_INIT(0);
93 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
94 .total = 0,
95};
96 96
97static struct inet_peer_base v6_peers = { 97static atomic_t *inetpeer_seq_ptr(int family)
98 .root = peer_avl_empty_rcu, 98{
99 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), 99 return (family == AF_INET ? &v4_seq : &v6_seq);
100 .total = 0, 100}
101}; 101
102static inline void flush_check(struct inet_peer_base *base, int family)
103{
104 atomic_t *fp = inetpeer_seq_ptr(family);
105
106 if (unlikely(base->flush_seq != atomic_read(fp))) {
107 inetpeer_invalidate_tree(base);
108 base->flush_seq = atomic_read(fp);
109 }
110}
111
112void inetpeer_invalidate_family(int family)
113{
114 atomic_t *fp = inetpeer_seq_ptr(family);
115
116 atomic_inc(fp);
117}
102 118
103#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 119#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
104 120
@@ -110,7 +126,7 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
110 126
111static void inetpeer_gc_worker(struct work_struct *work) 127static void inetpeer_gc_worker(struct work_struct *work)
112{ 128{
113 struct inet_peer *p, *n; 129 struct inet_peer *p, *n, *c;
114 LIST_HEAD(list); 130 LIST_HEAD(list);
115 131
116 spin_lock_bh(&gc_lock); 132 spin_lock_bh(&gc_lock);
@@ -122,17 +138,19 @@ static void inetpeer_gc_worker(struct work_struct *work)
122 138
123 list_for_each_entry_safe(p, n, &list, gc_list) { 139 list_for_each_entry_safe(p, n, &list, gc_list) {
124 140
125 if(need_resched()) 141 if (need_resched())
126 cond_resched(); 142 cond_resched();
127 143
128 if (p->avl_left != peer_avl_empty) { 144 c = rcu_dereference_protected(p->avl_left, 1);
129 list_add_tail(&p->avl_left->gc_list, &list); 145 if (c != peer_avl_empty) {
130 p->avl_left = peer_avl_empty; 146 list_add_tail(&c->gc_list, &list);
147 p->avl_left = peer_avl_empty_rcu;
131 } 148 }
132 149
133 if (p->avl_right != peer_avl_empty) { 150 c = rcu_dereference_protected(p->avl_right, 1);
134 list_add_tail(&p->avl_right->gc_list, &list); 151 if (c != peer_avl_empty) {
135 p->avl_right = peer_avl_empty; 152 list_add_tail(&c->gc_list, &list);
153 p->avl_right = peer_avl_empty_rcu;
136 } 154 }
137 155
138 n = list_entry(p->gc_list.next, struct inet_peer, gc_list); 156 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
@@ -401,11 +419,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
401 call_rcu(&p->rcu, inetpeer_free_rcu); 419 call_rcu(&p->rcu, inetpeer_free_rcu);
402} 420}
403 421
404static struct inet_peer_base *family_to_base(int family)
405{
406 return family == AF_INET ? &v4_peers : &v6_peers;
407}
408
409/* perform garbage collect on all items stacked during a lookup */ 422/* perform garbage collect on all items stacked during a lookup */
410static int inet_peer_gc(struct inet_peer_base *base, 423static int inet_peer_gc(struct inet_peer_base *base,
411 struct inet_peer __rcu **stack[PEER_MAXDEPTH], 424 struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -443,14 +456,17 @@ static int inet_peer_gc(struct inet_peer_base *base,
443 return cnt; 456 return cnt;
444} 457}
445 458
446struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create) 459struct inet_peer *inet_getpeer(struct inet_peer_base *base,
460 const struct inetpeer_addr *daddr,
461 int create)
447{ 462{
448 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 463 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
449 struct inet_peer_base *base = family_to_base(daddr->family);
450 struct inet_peer *p; 464 struct inet_peer *p;
451 unsigned int sequence; 465 unsigned int sequence;
452 int invalidated, gccnt = 0; 466 int invalidated, gccnt = 0;
453 467
468 flush_check(base, daddr->family);
469
454 /* Attempt a lockless lookup first. 470 /* Attempt a lockless lookup first.
455 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
456 */ 472 */
@@ -492,13 +508,9 @@ relookup:
492 (daddr->family == AF_INET) ? 508 (daddr->family == AF_INET) ?
493 secure_ip_id(daddr->addr.a4) : 509 secure_ip_id(daddr->addr.a4) :
494 secure_ipv6_id(daddr->addr.a6)); 510 secure_ipv6_id(daddr->addr.a6));
495 p->tcp_ts_stamp = 0;
496 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
497 p->rate_tokens = 0; 512 p->rate_tokens = 0;
498 p->rate_last = 0; 513 p->rate_last = 0;
499 p->pmtu_expires = 0;
500 p->pmtu_orig = 0;
501 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
502 INIT_LIST_HEAD(&p->gc_list); 514 INIT_LIST_HEAD(&p->gc_list);
503 515
504 /* Link the node. */ 516 /* Link the node. */
@@ -571,26 +583,19 @@ static void inetpeer_inval_rcu(struct rcu_head *head)
571 schedule_delayed_work(&gc_work, gc_delay); 583 schedule_delayed_work(&gc_work, gc_delay);
572} 584}
573 585
574void inetpeer_invalidate_tree(int family) 586void inetpeer_invalidate_tree(struct inet_peer_base *base)
575{ 587{
576 struct inet_peer *old, *new, *prev; 588 struct inet_peer *root;
577 struct inet_peer_base *base = family_to_base(family);
578 589
579 write_seqlock_bh(&base->lock); 590 write_seqlock_bh(&base->lock);
580 591
581 old = base->root; 592 root = rcu_deref_locked(base->root, base);
582 if (old == peer_avl_empty_rcu) 593 if (root != peer_avl_empty) {
583 goto out; 594 base->root = peer_avl_empty_rcu;
584
585 new = peer_avl_empty_rcu;
586
587 prev = cmpxchg(&base->root, old, new);
588 if (prev == old) {
589 base->total = 0; 595 base->total = 0;
590 call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); 596 call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
591 } 597 }
592 598
593out:
594 write_sequnlock_bh(&base->lock); 599 write_sequnlock_bh(&base->lock);
595} 600}
596EXPORT_SYMBOL(inetpeer_invalidate_tree); 601EXPORT_SYMBOL(inetpeer_invalidate_tree);
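
With family_to_base() removed, every caller now names the inet_peer_base it wants to search, and inetpeer_invalidate_family() merely bumps a per-family sequence counter; the tree behind a base is torn down lazily by flush_check() on the next inet_getpeer() call. A sketch of how the per-netns API fits together, assuming a base hung off net->ipv4.peers as the ip_fragment.c hunk below uses it (the allocation strategy here is illustrative, not necessarily what this series adopts verbatim):

/* Sketch only: give a namespace its own peer base and look a peer up. */
static int demo_peers_init(struct net *net)
{
	struct inet_peer_base *bp = kzalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);	/* empty root, fresh seqlock and flush_seq */
	net->ipv4.peers = bp;
	return 0;
}

static void demo_peer_touch(struct net *net, __be32 daddr)
{
	struct inet_peer *peer;

	/* flush_check() inside inet_getpeer() rebuilds the tree if
	 * inetpeer_invalidate_family(AF_INET) ran since the last lookup.
	 */
	peer = inet_getpeer_v4(net->ipv4.peers, daddr, 1);
	if (peer)
		inet_putpeer(peer);
}
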
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dbd3dd6022d..7ad88e5e7110 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -171,6 +171,10 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
171static void ip4_frag_init(struct inet_frag_queue *q, void *a) 171static void ip4_frag_init(struct inet_frag_queue *q, void *a)
172{ 172{
173 struct ipq *qp = container_of(q, struct ipq, q); 173 struct ipq *qp = container_of(q, struct ipq, q);
174 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
175 frags);
176 struct net *net = container_of(ipv4, struct net, ipv4);
177
174 struct ip4_create_arg *arg = a; 178 struct ip4_create_arg *arg = a;
175 179
176 qp->protocol = arg->iph->protocol; 180 qp->protocol = arg->iph->protocol;
@@ -180,7 +184,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
180 qp->daddr = arg->iph->daddr; 184 qp->daddr = arg->iph->daddr;
181 qp->user = arg->user; 185 qp->user = arg->user;
182 qp->peer = sysctl_ipfrag_max_dist ? 186 qp->peer = sysctl_ipfrag_max_dist ?
183 inet_getpeer_v4(arg->iph->saddr, 1) : NULL; 187 inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
184} 188}
185 189
186static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 190static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -254,8 +258,8 @@ static void ip_expire(unsigned long arg)
254 /* skb dst is stale, drop it, and perform route lookup again */ 258 /* skb dst is stale, drop it, and perform route lookup again */
255 skb_dst_drop(head); 259 skb_dst_drop(head);
256 iph = ip_hdr(head); 260 iph = ip_hdr(head);
257 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 261 err = ip_route_input(head, iph->daddr, iph->saddr,
258 iph->tos, head->dev); 262 iph->tos, head->dev);
259 if (err) 263 if (err)
260 goto out_rcu_unlock; 264 goto out_rcu_unlock;
261 265
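
ip4_frag_init() can only take this per-namespace detour because q->net is already valid when the constructor runs, which is exactly what the one-line reordering in inet_fragment.c above guarantees. The derivation itself is two container_of() steps; as a standalone helper it would look like this (the helper name is illustrative):

/* Recover the owning struct net from an IPv4 fragment queue:
 * q->net points into the netns_frags embedded in netns_ipv4, which is
 * itself embedded in struct net.
 */
static struct net *demo_frag_net(struct inet_frag_queue *q)
{
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4, frags);

	return container_of(ipv4, struct net, ipv4);
}
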
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f49047b79609..b062a98574f2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -516,9 +516,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
516 case ICMP_PORT_UNREACH: 516 case ICMP_PORT_UNREACH:
517 /* Impossible event. */ 517 /* Impossible event. */
518 return; 518 return;
519 case ICMP_FRAG_NEEDED:
520 /* Soft state for pmtu is maintained by IP core. */
521 return;
522 default: 519 default:
523 /* All others are translated to HOST_UNREACH. 520 /* All others are translated to HOST_UNREACH.
524 rfc2003 contains "deep thoughts" about NET_UNREACH, 521 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -531,6 +528,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
531 if (code != ICMP_EXC_TTL) 528 if (code != ICMP_EXC_TTL)
532 return; 529 return;
533 break; 530 break;
531
532 case ICMP_REDIRECT:
533 break;
534 } 534 }
535 535
536 rcu_read_lock(); 536 rcu_read_lock();
@@ -538,7 +538,20 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
538 flags & GRE_KEY ? 538 flags & GRE_KEY ?
539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 539 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
540 p[1]); 540 p[1]);
541 if (t == NULL || t->parms.iph.daddr == 0 || 541 if (t == NULL)
542 goto out;
543
544 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
545 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
546 t->parms.link, 0, IPPROTO_GRE, 0);
547 goto out;
548 }
549 if (type == ICMP_REDIRECT) {
550 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
551 IPPROTO_GRE, 0);
552 goto out;
553 }
554 if (t->parms.iph.daddr == 0 ||
542 ipv4_is_multicast(t->parms.iph.daddr)) 555 ipv4_is_multicast(t->parms.iph.daddr))
543 goto out; 556 goto out;
544 557
@@ -753,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
753 766
754 if (skb->protocol == htons(ETH_P_IP)) { 767 if (skb->protocol == htons(ETH_P_IP)) {
755 rt = skb_rtable(skb); 768 rt = skb_rtable(skb);
756 dst = rt->rt_gateway; 769 dst = rt_nexthop(rt, old_iph->daddr);
757 } 770 }
758#if IS_ENABLED(CONFIG_IPV6) 771#if IS_ENABLED(CONFIG_IPV6)
759 else if (skb->protocol == htons(ETH_P_IPV6)) { 772 else if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -820,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
820 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 833 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
821 834
822 if (skb_dst(skb)) 835 if (skb_dst(skb))
823 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 836 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
824 837
825 if (skb->protocol == htons(ETH_P_IP)) { 838 if (skb->protocol == htons(ETH_P_IP)) {
826 df |= (old_iph->frag_off&htons(IP_DF)); 839 df |= (old_iph->frag_off&htons(IP_DF));
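
The routing core no longer maintains tunnel PMTU state behind the driver's back, so ipgre_err() now handles ICMP_FRAG_NEEDED and ICMP_REDIRECT itself through the new ipv4_update_pmtu()/ipv4_redirect() helpers, keyed on the tunnel's own link and protocol. The same skeleton shows up in the other encapsulations touched by this series; condensed (tunnel lookup and locking elided):

/* Condensed sketch of the per-tunnel ICMP error handling used above;
 * "t" is whatever tunnel the error was matched to.
 */
static void demo_tunnel_err(struct sk_buff *skb, struct ip_tunnel *t,
			    int type, int code, u32 info)
{
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		/* shrink the cached path MTU for this tunnel's flow */
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT)
		/* relearn the next hop for this tunnel's flow */
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
}
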
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 8590144ca330..4ebc6feee250 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -198,14 +198,13 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
198 rcu_read_lock(); 198 rcu_read_lock();
199 { 199 {
200 int protocol = ip_hdr(skb)->protocol; 200 int protocol = ip_hdr(skb)->protocol;
201 int hash, raw;
202 const struct net_protocol *ipprot; 201 const struct net_protocol *ipprot;
202 int raw;
203 203
204 resubmit: 204 resubmit:
205 raw = raw_local_deliver(skb, protocol); 205 raw = raw_local_deliver(skb, protocol);
206 206
207 hash = protocol & (MAX_INET_PROTOS - 1); 207 ipprot = rcu_dereference(inet_protos[protocol]);
208 ipprot = rcu_dereference(inet_protos[hash]);
209 if (ipprot != NULL) { 208 if (ipprot != NULL) {
210 int ret; 209 int ret;
211 210
@@ -314,26 +313,33 @@ drop:
314 return true; 313 return true;
315} 314}
316 315
316int sysctl_ip_early_demux __read_mostly = 1;
317
317static int ip_rcv_finish(struct sk_buff *skb) 318static int ip_rcv_finish(struct sk_buff *skb)
318{ 319{
319 const struct iphdr *iph = ip_hdr(skb); 320 const struct iphdr *iph = ip_hdr(skb);
320 struct rtable *rt; 321 struct rtable *rt;
321 322
323 if (sysctl_ip_early_demux && !skb_dst(skb)) {
324 const struct net_protocol *ipprot;
325 int protocol = iph->protocol;
326
327 rcu_read_lock();
328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux)
330 ipprot->early_demux(skb);
331 rcu_read_unlock();
332 }
333
322 /* 334 /*
323 * Initialise the virtual path cache for the packet. It describes 335 * Initialise the virtual path cache for the packet. It describes
324 * how the packet travels inside Linux networking. 336 * how the packet travels inside Linux networking.
325 */ 337 */
326 if (skb_dst(skb) == NULL) { 338 if (!skb_dst(skb)) {
327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 339 int err = ip_route_input(skb, iph->daddr, iph->saddr,
328 iph->tos, skb->dev); 340 iph->tos, skb->dev);
329 if (unlikely(err)) { 341 if (unlikely(err)) {
330 if (err == -EHOSTUNREACH) 342 if (err == -EXDEV)
331 IP_INC_STATS_BH(dev_net(skb->dev),
332 IPSTATS_MIB_INADDRERRORS);
333 else if (err == -ENETUNREACH)
334 IP_INC_STATS_BH(dev_net(skb->dev),
335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev), 343 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER); 344 LINUX_MIB_IPRPFILTER);
339 goto drop; 345 goto drop;
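
Two independent things happen in the ip_input.c hunks: the inet_protos[] lookup drops the MAX_INET_PROTOS masking (the array is simply indexed by the 8-bit protocol number), and ip_rcv_finish() gains an optional early-demux pass, gated by sysctl_ip_early_demux, that lets a transport protocol attach a socket and cached dst to the skb before the route lookup. A protocol opts in through the early_demux member of its struct net_protocol; a sketch assuming the int-returning form of the hook (ip_rcv_finish() ignores the return value either way, as the hunk shows):

/* Placeholder protocol: names and bodies are illustrative only. */
static int demo_early_demux(struct sk_buff *skb)
{
	/* e.g. look up an established socket, set skb->sk and its cached dst */
	return 0;
}

static int demo_rcv(struct sk_buff *skb)
{
	kfree_skb(skb);			/* normal per-packet input path */
	return 0;
}

static const struct net_protocol demo_protocol = {
	.early_demux	= demo_early_demux,
	.handler	= demo_rcv,
	.no_policy	= 1,
	.netns_ok	= 1,
};
/* registered with inet_add_protocol(&demo_protocol, <protocol number>) */
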
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 708b99494e23..1dc01f9793d5 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -27,6 +27,7 @@
27#include <net/icmp.h> 27#include <net/icmp.h>
28#include <net/route.h> 28#include <net/route.h>
29#include <net/cipso_ipv4.h> 29#include <net/cipso_ipv4.h>
30#include <net/ip_fib.h>
30 31
31/* 32/*
32 * Write options to IP header, record destination address to 33 * Write options to IP header, record destination address to
@@ -92,7 +93,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
92 unsigned char *sptr, *dptr; 93 unsigned char *sptr, *dptr;
93 int soffset, doffset; 94 int soffset, doffset;
94 int optlen; 95 int optlen;
95 __be32 daddr;
96 96
97 memset(dopt, 0, sizeof(struct ip_options)); 97 memset(dopt, 0, sizeof(struct ip_options));
98 98
@@ -104,8 +104,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
104 sptr = skb_network_header(skb); 104 sptr = skb_network_header(skb);
105 dptr = dopt->__data; 105 dptr = dopt->__data;
106 106
107 daddr = skb_rtable(skb)->rt_spec_dst;
108
109 if (sopt->rr) { 107 if (sopt->rr) {
110 optlen = sptr[sopt->rr+1]; 108 optlen = sptr[sopt->rr+1];
111 soffset = sptr[sopt->rr+2]; 109 soffset = sptr[sopt->rr+2];
@@ -179,6 +177,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
179 doffset -= 4; 177 doffset -= 4;
180 } 178 }
181 if (doffset > 3) { 179 if (doffset > 3) {
180 __be32 daddr = fib_compute_spec_dst(skb);
181
182 memcpy(&start[doffset-1], &daddr, 4); 182 memcpy(&start[doffset-1], &daddr, 4);
183 dopt->faddr = faddr; 183 dopt->faddr = faddr;
184 dptr[0] = start[0]; 184 dptr[0] = start[0];
@@ -241,6 +241,15 @@ void ip_options_fragment(struct sk_buff *skb)
241 opt->ts_needtime = 0; 241 opt->ts_needtime = 0;
242} 242}
243 243
244/* helper used by ip_options_compile() to call fib_compute_spec_dst()
245 * at most one time.
246 */
247static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
248{
249 if (*spec_dst == htonl(INADDR_ANY))
250 *spec_dst = fib_compute_spec_dst(skb);
251}
252
244/* 253/*
245 * Verify options and fill pointers in struct options. 254 * Verify options and fill pointers in struct options.
246 * Caller should clear *opt, and set opt->data. 255 * Caller should clear *opt, and set opt->data.
@@ -250,12 +259,12 @@ void ip_options_fragment(struct sk_buff *skb)
250int ip_options_compile(struct net *net, 259int ip_options_compile(struct net *net,
251 struct ip_options *opt, struct sk_buff *skb) 260 struct ip_options *opt, struct sk_buff *skb)
252{ 261{
253 int l; 262 __be32 spec_dst = htonl(INADDR_ANY);
254 unsigned char *iph;
255 unsigned char *optptr;
256 int optlen;
257 unsigned char *pp_ptr = NULL; 263 unsigned char *pp_ptr = NULL;
258 struct rtable *rt = NULL; 264 struct rtable *rt = NULL;
265 unsigned char *optptr;
266 unsigned char *iph;
267 int optlen, l;
259 268
260 if (skb != NULL) { 269 if (skb != NULL) {
261 rt = skb_rtable(skb); 270 rt = skb_rtable(skb);
@@ -331,7 +340,8 @@ int ip_options_compile(struct net *net,
331 goto error; 340 goto error;
332 } 341 }
333 if (rt) { 342 if (rt) {
334 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 343 spec_dst_fill(&spec_dst, skb);
344 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
335 opt->is_changed = 1; 345 opt->is_changed = 1;
336 } 346 }
337 optptr[2] += 4; 347 optptr[2] += 4;
@@ -373,7 +383,8 @@ int ip_options_compile(struct net *net,
373 } 383 }
374 opt->ts = optptr - iph; 384 opt->ts = optptr - iph;
375 if (rt) { 385 if (rt) {
376 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 386 spec_dst_fill(&spec_dst, skb);
387 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
377 timeptr = &optptr[optptr[2]+3]; 388 timeptr = &optptr[optptr[2]+3];
378 } 389 }
379 opt->ts_needaddr = 1; 390 opt->ts_needaddr = 1;
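
Since rt_spec_dst no longer exists on the cached route, ip_options_compile() derives the specific destination on demand: spec_dst starts as INADDR_ANY and spec_dst_fill() resolves it through fib_compute_spec_dst() at most once per packet, however many record-route or timestamp slots ask for it. The same compute-once idiom in isolation (names illustrative):

/* Resolve the specific destination lazily, caching it in *cached. */
static __be32 demo_spec_dst(struct sk_buff *skb, __be32 *cached)
{
	if (*cached == htonl(INADDR_ANY))
		*cached = fib_compute_spec_dst(skb);	/* one FIB lookup at most */
	return *cached;
}
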
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451f97c42eb4..ba39a52d18c1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb)
113} 113}
114EXPORT_SYMBOL_GPL(ip_local_out); 114EXPORT_SYMBOL_GPL(ip_local_out);
115 115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 116static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{ 117{
131 int ttl = inet->uc_ttl; 118 int ttl = inet->uc_ttl;
@@ -183,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
183 struct net_device *dev = dst->dev; 170 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev); 171 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh; 172 struct neighbour *neigh;
173 u32 nexthop;
186 174
187 if (rt->rt_type == RTN_MULTICAST) { 175 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -200,19 +188,22 @@ static inline int ip_finish_output2(struct sk_buff *skb)
200 } 188 }
201 if (skb->sk) 189 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk); 190 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb); 191 consume_skb(skb);
204 skb = skb2; 192 skb = skb2;
205 } 193 }
206 194
207 rcu_read_lock(); 195 rcu_read_lock_bh();
208 neigh = dst_get_neighbour_noref(dst); 196 nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
209 if (neigh) { 200 if (neigh) {
210 int res = neigh_output(neigh, skb); 201 int res = dst_neigh_output(dst, neigh, skb);
211 202
212 rcu_read_unlock(); 203 rcu_read_unlock_bh();
213 return res; 204 return res;
214 } 205 }
215 rcu_read_unlock(); 206 rcu_read_unlock_bh();
216 207
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__); 209 __func__);
@@ -281,7 +272,7 @@ int ip_mc_output(struct sk_buff *skb)
281 if (newskb) 272 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev, 274 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit); 275 dev_loopback_xmit);
285 } 276 }
286 277
287 /* Multicasts with ttl 0 must not go beyond the host */ 278 /* Multicasts with ttl 0 must not go beyond the host */
@@ -296,7 +287,7 @@ int ip_mc_output(struct sk_buff *skb)
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb) 288 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit); 290 NULL, newskb->dev, dev_loopback_xmit);
300 } 291 }
301 292
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -380,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
380 skb_dst_set_noref(skb, &rt->dst); 371 skb_dst_set_noref(skb, &rt->dst);
381 372
382packet_routed: 373packet_routed:
383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 374 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
384 goto no_route; 375 goto no_route;
385 376
386 /* OK, we know where to send it, allocate and build IP header. */ 377 /* OK, we know where to send it, allocate and build IP header. */
@@ -709,7 +700,7 @@ slow_path:
709 700
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 701 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 } 702 }
712 kfree_skb(skb); 703 consume_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 704 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err; 705 return err;
715 706
@@ -1472,19 +1463,34 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1472 1463
1473/* 1464/*
1474 * Generic function to send a packet as reply to another packet. 1465 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too. 1466 * Used to send some TCP resets/acks so far.
1476 * 1467 *
1477 * Should run single threaded per socket because it uses the sock 1468 * Use a fake percpu inet socket to avoid false sharing and contention.
1478 * structure to pass arguments.
1479 */ 1469 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1470static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
1481 const struct ip_reply_arg *arg, unsigned int len) 1471 .sk = {
1472 .__sk_common = {
1473 .skc_refcnt = ATOMIC_INIT(1),
1474 },
1475 .sk_wmem_alloc = ATOMIC_INIT(1),
1476 .sk_allocation = GFP_ATOMIC,
1477 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1478 },
1479 .pmtudisc = IP_PMTUDISC_WANT,
1480 .uc_ttl = -1,
1481};
1482
1483void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1484 __be32 saddr, const struct ip_reply_arg *arg,
1485 unsigned int len)
1482{ 1486{
1483 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts; 1487 struct ip_options_data replyopts;
1485 struct ipcm_cookie ipc; 1488 struct ipcm_cookie ipc;
1486 struct flowi4 fl4; 1489 struct flowi4 fl4;
1487 struct rtable *rt = skb_rtable(skb); 1490 struct rtable *rt = skb_rtable(skb);
1491 struct sk_buff *nskb;
1492 struct sock *sk;
1493 struct inet_sock *inet;
1488 1494
1489 if (ip_options_echo(&replyopts.opt.opt, skb)) 1495 if (ip_options_echo(&replyopts.opt.opt, skb))
1490 return; 1496 return;
@@ -1502,38 +1508,39 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1502 1508
1503 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1509 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 RT_TOS(arg->tos), 1510 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1511 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1506 ip_reply_arg_flowi_flags(arg), 1512 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst, 1513 daddr, saddr,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1514 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1515 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4); 1516 rt = ip_route_output_key(net, &fl4);
1511 if (IS_ERR(rt)) 1517 if (IS_ERR(rt))
1512 return; 1518 return;
1513 1519
1514 /* And let IP do all the hard work. 1520 inet = &get_cpu_var(unicast_sock);
1515 1521
1516 This chunk is not reenterable, hence spinlock.
1517 Note that it uses the fact, that this function is called
1518 with locally disabled BH and that sk cannot be already spinlocked.
1519 */
1520 bh_lock_sock(sk);
1521 inet->tos = arg->tos; 1522 inet->tos = arg->tos;
1523 sk = &inet->sk;
1522 sk->sk_priority = skb->priority; 1524 sk->sk_priority = skb->priority;
1523 sk->sk_protocol = ip_hdr(skb)->protocol; 1525 sk->sk_protocol = ip_hdr(skb)->protocol;
1524 sk->sk_bound_dev_if = arg->bound_dev_if; 1526 sk->sk_bound_dev_if = arg->bound_dev_if;
1527 sock_net_set(sk, net);
1528 __skb_queue_head_init(&sk->sk_write_queue);
1529 sk->sk_sndbuf = sysctl_wmem_default;
1525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1530 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 &ipc, &rt, MSG_DONTWAIT); 1531 &ipc, &rt, MSG_DONTWAIT);
1527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1532 nskb = skb_peek(&sk->sk_write_queue);
1533 if (nskb) {
1528 if (arg->csumoffset >= 0) 1534 if (arg->csumoffset >= 0)
1529 *((__sum16 *)skb_transport_header(skb) + 1535 *((__sum16 *)skb_transport_header(nskb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1536 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1531 arg->csum)); 1537 arg->csum));
1532 skb->ip_summed = CHECKSUM_NONE; 1538 nskb->ip_summed = CHECKSUM_NONE;
1539 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1533 ip_push_pending_frames(sk, &fl4); 1540 ip_push_pending_frames(sk, &fl4);
1534 } 1541 }
1535 1542
1536 bh_unlock_sock(sk); 1543 put_cpu_var(unicast_sock);
1537 1544
1538 ip_rt_put(rt); 1545 ip_rt_put(rt);
1539} 1546}
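
ip_send_reply() used to push every TCP reset/ack reply through the caller's socket under bh_lock_sock(); ip_send_unicast_reply() instead borrows a statically initialised per-cpu inet_sock, so replies issued on different CPUs never contend on a lock or share cache lines. The borrow/return discipline it leans on is plain get_cpu_var()/put_cpu_var(); in isolation (the struct and the work done on it are placeholders):

#include <linux/percpu.h>

struct demo_ctx {
	int	scratch;
};

static DEFINE_PER_CPU(struct demo_ctx, demo_ctx);

static void demo_use_percpu_ctx(void)
{
	/* get_cpu_var() disables preemption, so this CPU's instance is
	 * ours alone until put_cpu_var().
	 */
	struct demo_ctx *ctx = &get_cpu_var(demo_ctx);

	ctx->scratch++;
	put_cpu_var(demo_ctx);
}
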
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0d11f234d615..5eea4a811042 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -40,6 +40,7 @@
40#if IS_ENABLED(CONFIG_IPV6) 40#if IS_ENABLED(CONFIG_IPV6)
41#include <net/transp_v6.h> 41#include <net/transp_v6.h>
42#endif 42#endif
43#include <net/ip_fib.h>
43 44
44#include <linux/errqueue.h> 45#include <linux/errqueue.h>
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
@@ -1019,18 +1020,17 @@ e_inval:
1019 * @sk: socket 1020 * @sk: socket
1020 * @skb: buffer 1021 * @skb: buffer
1021 * 1022 *
1022 * To support IP_CMSG_PKTINFO option, we store rt_iif and rt_spec_dst 1023 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
1023 * in skb->cb[] before dst drop. 1024 * destination in skb->cb[] before dst drop.
1024 * This way, receiver doesnt make cache line misses to read rtable. 1025 * This way, receiver doesnt make cache line misses to read rtable.
1025 */ 1026 */
1026void ipv4_pktinfo_prepare(struct sk_buff *skb) 1027void ipv4_pktinfo_prepare(struct sk_buff *skb)
1027{ 1028{
1028 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); 1029 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
1029 const struct rtable *rt = skb_rtable(skb);
1030 1030
1031 if (rt) { 1031 if (skb_rtable(skb)) {
1032 pktinfo->ipi_ifindex = rt->rt_iif; 1032 pktinfo->ipi_ifindex = inet_iif(skb);
1033 pktinfo->ipi_spec_dst.s_addr = rt->rt_spec_dst; 1033 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1034 } else { 1034 } else {
1035 pktinfo->ipi_ifindex = 0; 1035 pktinfo->ipi_ifindex = 0;
1036 pktinfo->ipi_spec_dst.s_addr = 0; 1036 pktinfo->ipi_spec_dst.s_addr = 0;
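
ipv4_pktinfo_prepare() still hands IP_CMSG_PKTINFO the same two values as before, the incoming interface and the specific destination; it just computes them with inet_iif() and fib_compute_spec_dst() now that rt_iif/rt_spec_dst are gone from the rtable. Nothing changes for userspace; for reference, this is the consumer those fields reach (a small, self-contained receive loop; the port number is an arbitrary example):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0), on = 1;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port = htons(5000),		/* example port */
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	char data[1500], cbuf[256], dst[INET_ADDRSTRLEN];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cmsg);

			/* ipi_ifindex <- inet_iif(), ipi_spec_dst <- fib_compute_spec_dst() */
			inet_ntop(AF_INET, &pi->ipi_spec_dst, dst, sizeof(dst));
			printf("ifindex %d spec_dst %s\n", pi->ipi_ifindex, dst);
		}
	}
	close(fd);
	return 0;
}
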
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000000..3511ffba7bd4
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,956 @@
1/*
2 * Linux NET3: IP/IP protocol decoder modified to support
3 * virtual tunnel interface
4 *
5 * Authors:
6 * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 */
14
15/*
16 This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
17
18 For comments look at net/ipv4/ip_gre.c --ANK
19 */
20
21
22#include <linux/capability.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/netfilter_ipv4.h>
36#include <linux/if_ether.h>
37
38#include <net/sock.h>
39#include <net/ip.h>
40#include <net/icmp.h>
41#include <net/ipip.h>
42#include <net/inet_ecn.h>
43#include <net/xfrm.h>
44#include <net/net_namespace.h>
45#include <net/netns/generic.h>
46
47#define HASH_SIZE 16
48#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
49
50static struct rtnl_link_ops vti_link_ops __read_mostly;
51
52static int vti_net_id __read_mostly;
53struct vti_net {
54 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
55 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
56 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
57 struct ip_tunnel __rcu *tunnels_wc[1];
58 struct ip_tunnel __rcu **tunnels[4];
59
60 struct net_device *fb_tunnel_dev;
61};
62
63static int vti_fb_tunnel_init(struct net_device *dev);
64static int vti_tunnel_init(struct net_device *dev);
65static void vti_tunnel_setup(struct net_device *dev);
66static void vti_dev_free(struct net_device *dev);
67static int vti_tunnel_bind_dev(struct net_device *dev);
68
69/* Locking : hash tables are protected by RCU and RTNL */
70
71#define for_each_ip_tunnel_rcu(start) \
72 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
73
74/* often modified stats are per cpu, other are shared (netdev->stats) */
75struct pcpu_tstats {
76 u64 rx_packets;
77 u64 rx_bytes;
78 u64 tx_packets;
79 u64 tx_bytes;
80 struct u64_stats_sync syncp;
81};
82
83#define VTI_XMIT(stats1, stats2) do { \
84 int err; \
85 int pkt_len = skb->len; \
86 err = dst_output(skb); \
87 if (net_xmit_eval(err) == 0) { \
88 u64_stats_update_begin(&(stats1)->syncp); \
89 (stats1)->tx_bytes += pkt_len; \
90 (stats1)->tx_packets++; \
91 u64_stats_update_end(&(stats1)->syncp); \
92 } else { \
93 (stats2)->tx_errors++; \
94 (stats2)->tx_aborted_errors++; \
95 } \
96} while (0)
97
98
99static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
100 struct rtnl_link_stats64 *tot)
101{
102 int i;
103
104 for_each_possible_cpu(i) {
105 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
106 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
107 unsigned int start;
108
109 do {
110 start = u64_stats_fetch_begin_bh(&tstats->syncp);
111 rx_packets = tstats->rx_packets;
112 tx_packets = tstats->tx_packets;
113 rx_bytes = tstats->rx_bytes;
114 tx_bytes = tstats->tx_bytes;
115 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
116
117 tot->rx_packets += rx_packets;
118 tot->tx_packets += tx_packets;
119 tot->rx_bytes += rx_bytes;
120 tot->tx_bytes += tx_bytes;
121 }
122
123 tot->multicast = dev->stats.multicast;
124 tot->rx_crc_errors = dev->stats.rx_crc_errors;
125 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
126 tot->rx_length_errors = dev->stats.rx_length_errors;
127 tot->rx_errors = dev->stats.rx_errors;
128 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
129 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
130 tot->tx_dropped = dev->stats.tx_dropped;
131 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
132 tot->tx_errors = dev->stats.tx_errors;
133
134 return tot;
135}
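
vti_get_stats64() is the reader half of the usual per-cpu 64-bit stats scheme: each CPU bumps its own pcpu_tstats inside u64_stats_update_begin()/end(), and the aggregator loops on u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh() until it observes a consistent snapshot (the seqcount only does real work on 32-bit, where a u64 cannot be read atomically). The matching writer side, as vti_rcv() uses further down, reduced to its essentials:

/* Writer-side sketch for the per-cpu tstats aggregated above. */
static void demo_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += len;
	u64_stats_update_end(&tstats->syncp);
}
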
136
137static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
138 __be32 remote, __be32 local)
139{
140 unsigned h0 = HASH(remote);
141 unsigned h1 = HASH(local);
142 struct ip_tunnel *t;
143 struct vti_net *ipn = net_generic(net, vti_net_id);
144
145 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
146 if (local == t->parms.iph.saddr &&
147 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
148 return t;
149 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
150 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
151 return t;
152
153 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
154 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
155 return t;
156
157 for_each_ip_tunnel_rcu(ipn->tunnels_wc[0])
158 if (t && (t->dev->flags&IFF_UP))
159 return t;
160 return NULL;
161}
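
Like ipip and GRE, VTI keeps four hash tables selected by which endpoint addresses are configured (prio 0: both wildcarded, 1: local only, 2: remote only, 3: both set) and hashes each address by xoring it with itself shifted right four bits and keeping the low four bits. A tiny standalone program reproducing the bucket selection done by __vti_bucket() just below (the addresses are documentation examples, not a real configuration):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ (((uint32_t)(addr)) >> 4)) & (HASH_SIZE - 1))

int main(void)
{
	/* network-byte-order values, as the kernel hashes them raw */
	uint32_t remote = inet_addr("192.0.2.1");	/* 0 if wildcarded */
	uint32_t local  = inet_addr("198.51.100.2");	/* 0 if wildcarded */
	unsigned int h = 0, prio = 0;

	if (remote) { prio |= 2; h ^= HASH(remote); }
	if (local)  { prio |= 1; h ^= HASH(local);  }

	printf("table %u, bucket %u of %u\n", prio, h, HASH_SIZE);
	return 0;
}
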
162
163static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
164 struct ip_tunnel_parm *parms)
165{
166 __be32 remote = parms->iph.daddr;
167 __be32 local = parms->iph.saddr;
168 unsigned h = 0;
169 int prio = 0;
170
171 if (remote) {
172 prio |= 2;
173 h ^= HASH(remote);
174 }
175 if (local) {
176 prio |= 1;
177 h ^= HASH(local);
178 }
179 return &ipn->tunnels[prio][h];
180}
181
182static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
183 struct ip_tunnel *t)
184{
185 return __vti_bucket(ipn, &t->parms);
186}
187
188static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
189{
190 struct ip_tunnel __rcu **tp;
191 struct ip_tunnel *iter;
192
193 for (tp = vti_bucket(ipn, t);
194 (iter = rtnl_dereference(*tp)) != NULL;
195 tp = &iter->next) {
196 if (t == iter) {
197 rcu_assign_pointer(*tp, t->next);
198 break;
199 }
200 }
201}
202
203static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
204{
205 struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
206
207 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
208 rcu_assign_pointer(*tp, t);
209}
210
211static struct ip_tunnel *vti_tunnel_locate(struct net *net,
212 struct ip_tunnel_parm *parms,
213 int create)
214{
215 __be32 remote = parms->iph.daddr;
216 __be32 local = parms->iph.saddr;
217 struct ip_tunnel *t, *nt;
218 struct ip_tunnel __rcu **tp;
219 struct net_device *dev;
220 char name[IFNAMSIZ];
221 struct vti_net *ipn = net_generic(net, vti_net_id);
222
223 for (tp = __vti_bucket(ipn, parms);
224 (t = rtnl_dereference(*tp)) != NULL;
225 tp = &t->next) {
226 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
227 return t;
228 }
229 if (!create)
230 return NULL;
231
232 if (parms->name[0])
233 strlcpy(name, parms->name, IFNAMSIZ);
234 else
235 strcpy(name, "vti%d");
236
237 dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
238 if (dev == NULL)
239 return NULL;
240
241 dev_net_set(dev, net);
242
243 nt = netdev_priv(dev);
244 nt->parms = *parms;
245 dev->rtnl_link_ops = &vti_link_ops;
246
247 vti_tunnel_bind_dev(dev);
248
249 if (register_netdevice(dev) < 0)
250 goto failed_free;
251
252 dev_hold(dev);
253 vti_tunnel_link(ipn, nt);
254 return nt;
255
256failed_free:
257 free_netdev(dev);
258 return NULL;
259}
260
261static void vti_tunnel_uninit(struct net_device *dev)
262{
263 struct net *net = dev_net(dev);
264 struct vti_net *ipn = net_generic(net, vti_net_id);
265
266 vti_tunnel_unlink(ipn, netdev_priv(dev));
267 dev_put(dev);
268}
269
270static int vti_err(struct sk_buff *skb, u32 info)
271{
272
273 /* All the routers (except for Linux) return only
274 * 8 bytes of packet payload. It means, that precise relaying of
275 * ICMP in the real Internet is absolutely infeasible.
276 */
277 struct iphdr *iph = (struct iphdr *)skb->data;
278 const int type = icmp_hdr(skb)->type;
279 const int code = icmp_hdr(skb)->code;
280 struct ip_tunnel *t;
281 int err;
282
283 switch (type) {
284 default:
285 case ICMP_PARAMETERPROB:
286 return 0;
287
288 case ICMP_DEST_UNREACH:
289 switch (code) {
290 case ICMP_SR_FAILED:
291 case ICMP_PORT_UNREACH:
292 /* Impossible event. */
293 return 0;
294 default:
295 /* All others are translated to HOST_UNREACH. */
296 break;
297 }
298 break;
299 case ICMP_TIME_EXCEEDED:
300 if (code != ICMP_EXC_TTL)
301 return 0;
302 break;
303 }
304
305 err = -ENOENT;
306
307 rcu_read_lock();
308 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
309 if (t == NULL)
310 goto out;
311
312 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
313 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
314 t->parms.link, 0, IPPROTO_IPIP, 0);
315 err = 0;
316 goto out;
317 }
318
319 err = 0;
320 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
321 goto out;
322
323 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
324 t->err_count++;
325 else
326 t->err_count = 1;
327 t->err_time = jiffies;
328out:
329 rcu_read_unlock();
330 return err;
331}
332
333/* We don't digest the packet, therefore let the packet pass */
334static int vti_rcv(struct sk_buff *skb)
335{
336 struct ip_tunnel *tunnel;
337 const struct iphdr *iph = ip_hdr(skb);
338
339 rcu_read_lock();
340 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
341 if (tunnel != NULL) {
342 struct pcpu_tstats *tstats;
343
344 tstats = this_cpu_ptr(tunnel->dev->tstats);
345 u64_stats_update_begin(&tstats->syncp);
346 tstats->rx_packets++;
347 tstats->rx_bytes += skb->len;
348 u64_stats_update_end(&tstats->syncp);
349
350 skb->dev = tunnel->dev;
351 rcu_read_unlock();
352 return 1;
353 }
354 rcu_read_unlock();
355
356 return -1;
357}
358
359/* This function assumes it is being called from dev_queue_xmit()
360 * and that skb is filled properly by that function.
361 */
362
363static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
364{
365 struct ip_tunnel *tunnel = netdev_priv(dev);
366 struct pcpu_tstats *tstats;
367 struct iphdr *tiph = &tunnel->parms.iph;
368 u8 tos;
369 struct rtable *rt; /* Route to the other host */
370 struct net_device *tdev; /* Device to other host */
371 struct iphdr *old_iph = ip_hdr(skb);
372 __be32 dst = tiph->daddr;
373 struct flowi4 fl4;
374
375 if (skb->protocol != htons(ETH_P_IP))
376 goto tx_error;
377
378 tos = old_iph->tos;
379
380 memset(&fl4, 0, sizeof(fl4));
381 flowi4_init_output(&fl4, tunnel->parms.link,
382 htonl(tunnel->parms.i_key), RT_TOS(tos),
383 RT_SCOPE_UNIVERSE,
384 IPPROTO_IPIP, 0,
385 dst, tiph->saddr, 0, 0);
386 rt = ip_route_output_key(dev_net(dev), &fl4);
387 if (IS_ERR(rt)) {
388 dev->stats.tx_carrier_errors++;
389 goto tx_error_icmp;
390 }
391 /* if there is no transform then this tunnel is not functional.
392 * Or if the xfrm is not mode tunnel.
393 */
394 if (!rt->dst.xfrm ||
395 rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
396 dev->stats.tx_carrier_errors++;
397 goto tx_error_icmp;
398 }
399 tdev = rt->dst.dev;
400
401 if (tdev == dev) {
402 ip_rt_put(rt);
403 dev->stats.collisions++;
404 goto tx_error;
405 }
406
407 if (tunnel->err_count > 0) {
408 if (time_before(jiffies,
409 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
410 tunnel->err_count--;
411 dst_link_failure(skb);
412 } else
413 tunnel->err_count = 0;
414 }
415
416 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
417 IPSKB_REROUTED);
418 skb_dst_drop(skb);
419 skb_dst_set(skb, &rt->dst);
420 nf_reset(skb);
421 skb->dev = skb_dst(skb)->dev;
422
423 tstats = this_cpu_ptr(dev->tstats);
424 VTI_XMIT(tstats, &dev->stats);
425 return NETDEV_TX_OK;
426
427tx_error_icmp:
428 dst_link_failure(skb);
429tx_error:
430 dev->stats.tx_errors++;
431 dev_kfree_skb(skb);
432 return NETDEV_TX_OK;
433}
434
435static int vti_tunnel_bind_dev(struct net_device *dev)
436{
437 struct net_device *tdev = NULL;
438 struct ip_tunnel *tunnel;
439 struct iphdr *iph;
440
441 tunnel = netdev_priv(dev);
442 iph = &tunnel->parms.iph;
443
444 if (iph->daddr) {
445 struct rtable *rt;
446 struct flowi4 fl4;
447 memset(&fl4, 0, sizeof(fl4));
448 flowi4_init_output(&fl4, tunnel->parms.link,
449 htonl(tunnel->parms.i_key),
450 RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
451 IPPROTO_IPIP, 0,
452 iph->daddr, iph->saddr, 0, 0);
453 rt = ip_route_output_key(dev_net(dev), &fl4);
454 if (!IS_ERR(rt)) {
455 tdev = rt->dst.dev;
456 ip_rt_put(rt);
457 }
458 dev->flags |= IFF_POINTOPOINT;
459 }
460
461 if (!tdev && tunnel->parms.link)
462 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
463
464 if (tdev) {
465 dev->hard_header_len = tdev->hard_header_len +
466 sizeof(struct iphdr);
467 dev->mtu = tdev->mtu;
468 }
469 dev->iflink = tunnel->parms.link;
470 return dev->mtu;
471}
472
473static int
474vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
475{
476 int err = 0;
477 struct ip_tunnel_parm p;
478 struct ip_tunnel *t;
479 struct net *net = dev_net(dev);
480 struct vti_net *ipn = net_generic(net, vti_net_id);
481
482 switch (cmd) {
483 case SIOCGETTUNNEL:
484 t = NULL;
485 if (dev == ipn->fb_tunnel_dev) {
486 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
487 sizeof(p))) {
488 err = -EFAULT;
489 break;
490 }
491 t = vti_tunnel_locate(net, &p, 0);
492 }
493 if (t == NULL)
494 t = netdev_priv(dev);
495 memcpy(&p, &t->parms, sizeof(p));
496 p.i_flags |= GRE_KEY | VTI_ISVTI;
497 p.o_flags |= GRE_KEY;
498 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
499 err = -EFAULT;
500 break;
501
502 case SIOCADDTUNNEL:
503 case SIOCCHGTUNNEL:
504 err = -EPERM;
505 if (!capable(CAP_NET_ADMIN))
506 goto done;
507
508 err = -EFAULT;
509 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
510 goto done;
511
512 err = -EINVAL;
513 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
514 p.iph.ihl != 5)
515 goto done;
516
517 t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
518
519 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
520 if (t != NULL) {
521 if (t->dev != dev) {
522 err = -EEXIST;
523 break;
524 }
525 } else {
526 if (((dev->flags&IFF_POINTOPOINT) &&
527 !p.iph.daddr) ||
528 (!(dev->flags&IFF_POINTOPOINT) &&
529 p.iph.daddr)) {
530 err = -EINVAL;
531 break;
532 }
533 t = netdev_priv(dev);
534 vti_tunnel_unlink(ipn, t);
535 synchronize_net();
536 t->parms.iph.saddr = p.iph.saddr;
537 t->parms.iph.daddr = p.iph.daddr;
538 t->parms.i_key = p.i_key;
539 t->parms.o_key = p.o_key;
540 t->parms.iph.protocol = IPPROTO_IPIP;
541 memcpy(dev->dev_addr, &p.iph.saddr, 4);
542 memcpy(dev->broadcast, &p.iph.daddr, 4);
543 vti_tunnel_link(ipn, t);
544 netdev_state_change(dev);
545 }
546 }
547
548 if (t) {
549 err = 0;
550 if (cmd == SIOCCHGTUNNEL) {
551 t->parms.i_key = p.i_key;
552 t->parms.o_key = p.o_key;
553 if (t->parms.link != p.link) {
554 t->parms.link = p.link;
555 vti_tunnel_bind_dev(dev);
556 netdev_state_change(dev);
557 }
558 }
559 p.i_flags |= GRE_KEY | VTI_ISVTI;
560 p.o_flags |= GRE_KEY;
561 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
562 sizeof(p)))
563 err = -EFAULT;
564 } else
565 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
566 break;
567
568 case SIOCDELTUNNEL:
569 err = -EPERM;
570 if (!capable(CAP_NET_ADMIN))
571 goto done;
572
573 if (dev == ipn->fb_tunnel_dev) {
574 err = -EFAULT;
575 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
576 sizeof(p)))
577 goto done;
578 err = -ENOENT;
579
580 t = vti_tunnel_locate(net, &p, 0);
581 if (t == NULL)
582 goto done;
583 err = -EPERM;
584 if (t->dev == ipn->fb_tunnel_dev)
585 goto done;
586 dev = t->dev;
587 }
588 unregister_netdevice(dev);
589 err = 0;
590 break;
591
592 default:
593 err = -EINVAL;
594 }
595
596done:
597 return err;
598}
599
600static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
601{
602 if (new_mtu < 68 || new_mtu > 0xFFF8)
603 return -EINVAL;
604 dev->mtu = new_mtu;
605 return 0;
606}
607
608static const struct net_device_ops vti_netdev_ops = {
609 .ndo_init = vti_tunnel_init,
610 .ndo_uninit = vti_tunnel_uninit,
611 .ndo_start_xmit = vti_tunnel_xmit,
612 .ndo_do_ioctl = vti_tunnel_ioctl,
613 .ndo_change_mtu = vti_tunnel_change_mtu,
614 .ndo_get_stats64 = vti_get_stats64,
615};
616
617static void vti_dev_free(struct net_device *dev)
618{
619 free_percpu(dev->tstats);
620 free_netdev(dev);
621}
622
623static void vti_tunnel_setup(struct net_device *dev)
624{
625 dev->netdev_ops = &vti_netdev_ops;
626 dev->destructor = vti_dev_free;
627
628 dev->type = ARPHRD_TUNNEL;
629 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
630 dev->mtu = ETH_DATA_LEN;
631 dev->flags = IFF_NOARP;
632 dev->iflink = 0;
633 dev->addr_len = 4;
634 dev->features |= NETIF_F_NETNS_LOCAL;
635 dev->features |= NETIF_F_LLTX;
636 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
637}
638
639static int vti_tunnel_init(struct net_device *dev)
640{
641 struct ip_tunnel *tunnel = netdev_priv(dev);
642
643 tunnel->dev = dev;
644 strcpy(tunnel->parms.name, dev->name);
645
646 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
647 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
648
649 dev->tstats = alloc_percpu(struct pcpu_tstats);
650 if (!dev->tstats)
651 return -ENOMEM;
652
653 return 0;
654}
655
656static int __net_init vti_fb_tunnel_init(struct net_device *dev)
657{
658 struct ip_tunnel *tunnel = netdev_priv(dev);
659 struct iphdr *iph = &tunnel->parms.iph;
660 struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
661
662 tunnel->dev = dev;
663 strcpy(tunnel->parms.name, dev->name);
664
665 iph->version = 4;
666 iph->protocol = IPPROTO_IPIP;
667 iph->ihl = 5;
668
669 dev->tstats = alloc_percpu(struct pcpu_tstats);
670 if (!dev->tstats)
671 return -ENOMEM;
672
673 dev_hold(dev);
674 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
675 return 0;
676}
677
678static struct xfrm_tunnel vti_handler __read_mostly = {
679 .handler = vti_rcv,
680 .err_handler = vti_err,
681 .priority = 1,
682};
683
684static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
685{
686 int prio;
687
688 for (prio = 1; prio < 4; prio++) {
689 int h;
690 for (h = 0; h < HASH_SIZE; h++) {
691 struct ip_tunnel *t;
692
693 t = rtnl_dereference(ipn->tunnels[prio][h]);
694 while (t != NULL) {
695 unregister_netdevice_queue(t->dev, head);
696 t = rtnl_dereference(t->next);
697 }
698 }
699 }
700}
701
702static int __net_init vti_init_net(struct net *net)
703{
704 int err;
705 struct vti_net *ipn = net_generic(net, vti_net_id);
706
707 ipn->tunnels[0] = ipn->tunnels_wc;
708 ipn->tunnels[1] = ipn->tunnels_l;
709 ipn->tunnels[2] = ipn->tunnels_r;
710 ipn->tunnels[3] = ipn->tunnels_r_l;
711
712 ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
713 "ip_vti0",
714 vti_tunnel_setup);
715 if (!ipn->fb_tunnel_dev) {
716 err = -ENOMEM;
717 goto err_alloc_dev;
718 }
719 dev_net_set(ipn->fb_tunnel_dev, net);
720
721 err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
722 if (err)
723 goto err_reg_dev;
724 ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
725
726 err = register_netdev(ipn->fb_tunnel_dev);
727 if (err)
728 goto err_reg_dev;
729 return 0;
730
731err_reg_dev:
732 vti_dev_free(ipn->fb_tunnel_dev);
733err_alloc_dev:
734 /* nothing */
735 return err;
736}
737
738static void __net_exit vti_exit_net(struct net *net)
739{
740 struct vti_net *ipn = net_generic(net, vti_net_id);
741 LIST_HEAD(list);
742
743 rtnl_lock();
744 vti_destroy_tunnels(ipn, &list);
745 unregister_netdevice_many(&list);
746 rtnl_unlock();
747}
748
749static struct pernet_operations vti_net_ops = {
750 .init = vti_init_net,
751 .exit = vti_exit_net,
752 .id = &vti_net_id,
753 .size = sizeof(struct vti_net),
754};
755
756static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
757{
758 return 0;
759}
760
761static void vti_netlink_parms(struct nlattr *data[],
762 struct ip_tunnel_parm *parms)
763{
764 memset(parms, 0, sizeof(*parms));
765
766 parms->iph.protocol = IPPROTO_IPIP;
767
768 if (!data)
769 return;
770
771 if (data[IFLA_VTI_LINK])
772 parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
773
774 if (data[IFLA_VTI_IKEY])
775 parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
776
777 if (data[IFLA_VTI_OKEY])
778 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
779
780 if (data[IFLA_VTI_LOCAL])
781 parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
782
783 if (data[IFLA_VTI_REMOTE])
784 parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
785
786}
787
788static int vti_newlink(struct net *src_net, struct net_device *dev,
789 struct nlattr *tb[], struct nlattr *data[])
790{
791 struct ip_tunnel *nt;
792 struct net *net = dev_net(dev);
793 struct vti_net *ipn = net_generic(net, vti_net_id);
794 int mtu;
795 int err;
796
797 nt = netdev_priv(dev);
798 vti_netlink_parms(data, &nt->parms);
799
800 if (vti_tunnel_locate(net, &nt->parms, 0))
801 return -EEXIST;
802
803 mtu = vti_tunnel_bind_dev(dev);
804 if (!tb[IFLA_MTU])
805 dev->mtu = mtu;
806
807 err = register_netdevice(dev);
808 if (err)
809 goto out;
810
811 dev_hold(dev);
812 vti_tunnel_link(ipn, nt);
813
814out:
815 return err;
816}
817
818static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
819 struct nlattr *data[])
820{
821 struct ip_tunnel *t, *nt;
822 struct net *net = dev_net(dev);
823 struct vti_net *ipn = net_generic(net, vti_net_id);
824 struct ip_tunnel_parm p;
825 int mtu;
826
827 if (dev == ipn->fb_tunnel_dev)
828 return -EINVAL;
829
830 nt = netdev_priv(dev);
831 vti_netlink_parms(data, &p);
832
833 t = vti_tunnel_locate(net, &p, 0);
834
835 if (t) {
836 if (t->dev != dev)
837 return -EEXIST;
838 } else {
839 t = nt;
840
841 vti_tunnel_unlink(ipn, t);
842 t->parms.iph.saddr = p.iph.saddr;
843 t->parms.iph.daddr = p.iph.daddr;
844 t->parms.i_key = p.i_key;
845 t->parms.o_key = p.o_key;
846 if (dev->type != ARPHRD_ETHER) {
847 memcpy(dev->dev_addr, &p.iph.saddr, 4);
848 memcpy(dev->broadcast, &p.iph.daddr, 4);
849 }
850 vti_tunnel_link(ipn, t);
851 netdev_state_change(dev);
852 }
853
854 if (t->parms.link != p.link) {
855 t->parms.link = p.link;
856 mtu = vti_tunnel_bind_dev(dev);
857 if (!tb[IFLA_MTU])
858 dev->mtu = mtu;
859 netdev_state_change(dev);
860 }
861
862 return 0;
863}
864
865static size_t vti_get_size(const struct net_device *dev)
866{
867 return
868 /* IFLA_VTI_LINK */
869 nla_total_size(4) +
870 /* IFLA_VTI_IKEY */
871 nla_total_size(4) +
872 /* IFLA_VTI_OKEY */
873 nla_total_size(4) +
874 /* IFLA_VTI_LOCAL */
875 nla_total_size(4) +
876 /* IFLA_VTI_REMOTE */
877 nla_total_size(4) +
878 0;
879}
880
881static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
882{
883 struct ip_tunnel *t = netdev_priv(dev);
884 struct ip_tunnel_parm *p = &t->parms;
885
886 nla_put_u32(skb, IFLA_VTI_LINK, p->link);
887 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
888 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
889 nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
890 nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
891
892 return 0;
893}
894
895static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
896 [IFLA_VTI_LINK] = { .type = NLA_U32 },
897 [IFLA_VTI_IKEY] = { .type = NLA_U32 },
898 [IFLA_VTI_OKEY] = { .type = NLA_U32 },
899 [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
900 [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
901};
902
903static struct rtnl_link_ops vti_link_ops __read_mostly = {
904 .kind = "vti",
905 .maxtype = IFLA_VTI_MAX,
906 .policy = vti_policy,
907 .priv_size = sizeof(struct ip_tunnel),
908 .setup = vti_tunnel_setup,
909 .validate = vti_tunnel_validate,
910 .newlink = vti_newlink,
911 .changelink = vti_changelink,
912 .get_size = vti_get_size,
913 .fill_info = vti_fill_info,
914};
915
916static int __init vti_init(void)
917{
918 int err;
919
920 pr_info("IPv4 over IPSec tunneling driver\n");
921
922 err = register_pernet_device(&vti_net_ops);
923 if (err < 0)
924 return err;
925 err = xfrm4_mode_tunnel_input_register(&vti_handler);
926 if (err < 0) {
927 unregister_pernet_device(&vti_net_ops);
928		pr_info("vti init: can't register tunnel\n");
929 }
930
931 err = rtnl_link_register(&vti_link_ops);
932 if (err < 0)
933 goto rtnl_link_failed;
934
935 return err;
936
937rtnl_link_failed:
938 xfrm4_mode_tunnel_input_deregister(&vti_handler);
939 unregister_pernet_device(&vti_net_ops);
940 return err;
941}
942
943static void __exit vti_fini(void)
944{
945 rtnl_link_unregister(&vti_link_ops);
946 if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
947 pr_info("vti close: can't deregister tunnel\n");
948
949 unregister_pernet_device(&vti_net_ops);
950}
951
952module_init(vti_init);
953module_exit(vti_fini);
954MODULE_LICENSE("GPL");
955MODULE_ALIAS_RTNL_LINK("vti");
956MODULE_ALIAS_NETDEV("ip_vti0");
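
Editor's note: the SIOCADDTUNNEL path served by vti_tunnel_ioctl() above is normally driven from userspace the same way as the other ipv4 tunnel drivers. The sketch below is not part of the patch; the device name, addresses and key are illustrative assumptions, and the call requires CAP_NET_ADMIN as the handler checks.

/* Hypothetical userspace sketch: create a vti tunnel through the
 * SIOCADDTUNNEL ioctl served by vti_tunnel_ioctl() above.
 * Addresses, key and device names are illustrative only. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_tunnel.h>

int main(void)
{
	struct ip_tunnel_parm p;
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&p, 0, sizeof(p));
	strncpy(p.name, "vti1", IFNAMSIZ - 1);	/* device to create */
	p.iph.version = 4;			/* vti_tunnel_ioctl() insists on these */
	p.iph.ihl = 5;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.saddr = inet_addr("192.0.2.1");
	p.iph.daddr = inet_addr("198.51.100.1");
	p.i_key = p.o_key = htonl(42);		/* ties the tunnel to its IPsec policy */

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "ip_vti0", IFNAMSIZ - 1);	/* fallback device */
	ifr.ifr_ifru.ifru_data = (void *)&p;

	if (ioctl(fd, SIOCADDTUNNEL, &ifr) < 0)	/* needs CAP_NET_ADMIN */
		perror("SIOCADDTUNNEL");
	close(fd);
	return 0;
}
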
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 63b64c45a826..d3ab47e19a89 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,17 +31,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
34 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH || 34 switch (icmp_hdr(skb)->type) {
35 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 35 case ICMP_DEST_UNREACH:
36 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
37 return;
38 case ICMP_REDIRECT:
39 break;
40 default:
36 return; 41 return;
42 }
37 43
38 spi = htonl(ntohs(ipch->cpi)); 44 spi = htonl(ntohs(ipch->cpi));
39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 45 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
40 spi, IPPROTO_COMP, AF_INET); 46 spi, IPPROTO_COMP, AF_INET);
41 if (!x) 47 if (!x)
42 return; 48 return;
43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n", 49
44 spi, &iph->daddr); 50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
52 else
53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
45 xfrm_state_put(x); 54 xfrm_state_put(x);
46} 55}
47 56
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2d0f99bf61b3..99af1f0cc658 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -348,9 +348,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
348 case ICMP_PORT_UNREACH: 348 case ICMP_PORT_UNREACH:
349 /* Impossible event. */ 349 /* Impossible event. */
350 return 0; 350 return 0;
351 case ICMP_FRAG_NEEDED:
352 /* Soft state for pmtu is maintained by IP core. */
353 return 0;
354 default: 351 default:
355 /* All others are translated to HOST_UNREACH. 352 /* All others are translated to HOST_UNREACH.
356 rfc2003 contains "deep thoughts" about NET_UNREACH, 353 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -363,13 +360,32 @@ static int ipip_err(struct sk_buff *skb, u32 info)
363 if (code != ICMP_EXC_TTL) 360 if (code != ICMP_EXC_TTL)
364 return 0; 361 return 0;
365 break; 362 break;
363 case ICMP_REDIRECT:
364 break;
366 } 365 }
367 366
368 err = -ENOENT; 367 err = -ENOENT;
369 368
370 rcu_read_lock(); 369 rcu_read_lock();
371 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 370 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
372 if (t == NULL || t->parms.iph.daddr == 0) 371 if (t == NULL)
372 goto out;
373
374 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
375 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
376 t->dev->ifindex, 0, IPPROTO_IPIP, 0);
377 err = 0;
378 goto out;
379 }
380
381 if (type == ICMP_REDIRECT) {
382 ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
383 IPPROTO_IPIP, 0);
384 err = 0;
385 goto out;
386 }
387
388 if (t->parms.iph.daddr == 0)
373 goto out; 389 goto out;
374 390
375 err = 0; 391 err = 0;
@@ -471,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
471 dev->stats.tx_fifo_errors++; 487 dev->stats.tx_fifo_errors++;
472 goto tx_error; 488 goto tx_error;
473 } 489 }
474 dst = rt->rt_gateway; 490 dst = rt_nexthop(rt, old_iph->daddr);
475 } 491 }
476 492
477 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 493 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
@@ -503,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
503 } 519 }
504 520
505 if (skb_dst(skb)) 521 if (skb_dst(skb))
506 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 522 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
507 523
508 if ((old_iph->frag_off & htons(IP_DF)) && 524 if ((old_iph->frag_off & htons(IP_DF)) &&
509 mtu < ntohs(old_iph->tot_len)) { 525 mtu < ntohs(old_iph->tot_len)) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c94bbc6f2ba3..8eec8f4a0536 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -524,8 +524,8 @@ failure:
524} 524}
525#endif 525#endif
526 526
527/* 527/**
528 * Delete a VIF entry 528 * vif_delete - Delete a VIF entry
529 * @notify: Set to 1, if the caller is a notifier_call 529 * @notify: Set to 1, if the caller is a notifier_call
530 */ 530 */
531 531
@@ -1795,9 +1795,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1795 .daddr = iph->daddr, 1795 .daddr = iph->daddr,
1796 .saddr = iph->saddr, 1796 .saddr = iph->saddr,
1797 .flowi4_tos = RT_TOS(iph->tos), 1797 .flowi4_tos = RT_TOS(iph->tos),
1798 .flowi4_oif = rt->rt_oif, 1798 .flowi4_oif = (rt_is_output_route(rt) ?
1799 .flowi4_iif = rt->rt_iif, 1799 skb->dev->ifindex : 0),
1800 .flowi4_mark = rt->rt_mark, 1800 .flowi4_iif = (rt_is_output_route(rt) ?
1801 net->loopback_dev->ifindex :
1802 skb->dev->ifindex),
1803 .flowi4_mark = skb->mark,
1801 }; 1804 };
1802 struct mr_table *mrt; 1805 struct mr_table *mrt;
1803 int err; 1806 int err;
@@ -2006,37 +2009,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006{ 2009{
2007 int ct; 2010 int ct;
2008 struct rtnexthop *nhp; 2011 struct rtnexthop *nhp;
2009 u8 *b = skb_tail_pointer(skb); 2012 struct nlattr *mp_attr;
2010 struct rtattr *mp_head;
2011 2013
2012 /* If cache is unresolved, don't try to parse IIF and OIF */ 2014 /* If cache is unresolved, don't try to parse IIF and OIF */
2013 if (c->mfc_parent >= MAXVIFS) 2015 if (c->mfc_parent >= MAXVIFS)
2014 return -ENOENT; 2016 return -ENOENT;
2015 2017
2016 if (VIF_EXISTS(mrt, c->mfc_parent)) 2018 if (VIF_EXISTS(mrt, c->mfc_parent) &&
2017 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex); 2019 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
2020 return -EMSGSIZE;
2018 2021
2019 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 2022 if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
2023 return -EMSGSIZE;
2020 2024
2021 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 2025 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2022 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { 2026 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
2023 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 2027 if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
2024 goto rtattr_failure; 2028 nla_nest_cancel(skb, mp_attr);
2025 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 2029 return -EMSGSIZE;
2030 }
2031
2026 nhp->rtnh_flags = 0; 2032 nhp->rtnh_flags = 0;
2027 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 2033 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2028 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; 2034 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
2029 nhp->rtnh_len = sizeof(*nhp); 2035 nhp->rtnh_len = sizeof(*nhp);
2030 } 2036 }
2031 } 2037 }
2032 mp_head->rta_type = RTA_MULTIPATH; 2038
2033 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; 2039 nla_nest_end(skb, mp_attr);
2040
2034 rtm->rtm_type = RTN_MULTICAST; 2041 rtm->rtm_type = RTN_MULTICAST;
2035 return 1; 2042 return 1;
2036
2037rtattr_failure:
2038 nlmsg_trim(skb, b);
2039 return -EMSGSIZE;
2040} 2043}
2041 2044
2042int ipmr_get_route(struct net *net, struct sk_buff *skb, 2045int ipmr_get_route(struct net *net, struct sk_buff *skb,
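
Editor's note: the __ipmr_fill_mroute() rewrite above is one instance of the tree-wide move from RTA_PUT()/rtattr_failure (a macro hiding a goto) to the nla_* helpers, which report failure through return values. A minimal sketch of the nesting idiom it now uses; the wrapper function and its arguments are illustrative, the helper calls are the same ones used in the hunk.

/* Sketch: build a nested RTA_MULTIPATH attribute with the nla_* helpers,
 * backing out cleanly if the skb runs out of tail room. */
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>

static int fill_one_nexthop(struct sk_buff *skb, int ifindex, u8 hops)
{
	struct nlattr *mp_attr;
	struct rtnexthop *nhp;

	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
	if (!mp_attr)
		return -EMSGSIZE;

	nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
	if (!nhp) {
		nla_nest_cancel(skb, mp_attr);	/* trim the partial nest */
		return -EMSGSIZE;
	}
	nhp->rtnh_flags = 0;
	nhp->rtnh_hops = hops;
	nhp->rtnh_ifindex = ifindex;
	nhp->rtnh_len = sizeof(*nhp);

	nla_nest_end(skb, mp_attr);		/* fixes up the nest length */
	return 0;
}
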
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c79dc87..cbb6a1a6f6f7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_ipv4_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc; 55 __be32 newsrc, nh;
56 56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58 58
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
70 70
71 mr = par->targinfo; 71 mr = par->targinfo;
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
74 if (!newsrc) { 75 if (!newsrc) {
75 pr_info("%s ate my IP address\n", par->out->name); 76 pr_info("%s ate my IP address\n", par->out->name);
76 return NF_DROP; 77 return NF_DROP;
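
Editor's note: rt_nexthop(), used above instead of reading rt->rt_gateway directly, is a helper added elsewhere in this series: it returns the route's gateway when one is set and otherwise falls back to the packet's own destination, which on-link routes need now that rt_gateway may be left unset. A behavioural sketch; the real definition lives in include/net/route.h, so treat this rendering as an approximation.

/* Approximate behaviour of rt_nexthop() as used by the hunk above. */
#include <linux/types.h>
#include <net/route.h>

static inline __be32 rt_nexthop_sketch(const struct rtable *rt, __be32 daddr)
{
	if (rt->rt_gateway)
		return rt->rt_gateway;	/* explicit gateway on the route */
	return daddr;			/* on-link: next hop is the destination */
}
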
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d20165..1109f7f6c254 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
196 196
197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); 197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
198 198
199 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ 199 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
200 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 200 sizeof(*pm)+copy_len, 0);
201 sizeof(*pm)+copy_len); 201 if (!nlh) {
202 pr_debug("error during nlmsg_put\n");
203 goto out_unlock;
204 }
202 ub->qlen++; 205 ub->qlen++;
203 206
204 pm = NLMSG_DATA(nlh); 207 pm = nlmsg_data(nlh);
205 208
206 /* We might not have a timestamp, get one */ 209 /* We might not have a timestamp, get one */
207 if (skb->tstamp.tv64 == 0) 210 if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
261 nlh->nlmsg_type = NLMSG_DONE; 264 nlh->nlmsg_type = NLMSG_DONE;
262 ulog_send(groupnum); 265 ulog_send(groupnum);
263 } 266 }
264 267out_unlock:
265 spin_unlock_bh(&ulog_lock); 268 spin_unlock_bh(&ulog_lock);
266 269
267 return; 270 return;
268 271
269nlmsg_failure:
270 pr_debug("error during NLMSG_PUT\n");
271alloc_failure: 272alloc_failure:
272 pr_debug("Error building netlink message\n"); 273 pr_debug("Error building netlink message\n");
273 spin_unlock_bh(&ulog_lock); 274 spin_unlock_bh(&ulog_lock);
@@ -380,6 +381,9 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
380static int __init ulog_tg_init(void) 381static int __init ulog_tg_init(void)
381{ 382{
382 int ret, i; 383 int ret, i;
384 struct netlink_kernel_cfg cfg = {
385 .groups = ULOG_MAXNLGROUPS,
386 };
383 387
384 pr_debug("init module\n"); 388 pr_debug("init module\n");
385 389
@@ -392,9 +396,8 @@ static int __init ulog_tg_init(void)
392 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 396 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
393 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
394 398
395 nflognl = netlink_kernel_create(&init_net, 399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
396 NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, 400 THIS_MODULE, &cfg);
397 NULL, THIS_MODULE);
398 if (!nflognl) 401 if (!nflognl)
399 return -ENOMEM; 402 return -ENOMEM;
400 403
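
Editor's note: two interface changes meet in the ipt_ULOG hunks above. NLMSG_PUT(), whose failure path was a hidden goto nlmsg_failure, becomes a plain nlmsg_put() call with an explicit NULL check, and netlink_kernel_create() now takes a struct netlink_kernel_cfg rather than a growing argument list. A kernel-style sketch of the new calls, reduced to the pieces the hunks exercise; the wrapper names are illustrative.

/* Sketch of the new netlink kernel socket + message setup used above. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/netlink.h>

static struct sock *ulog_socket_sketch(void)
{
	struct netlink_kernel_cfg cfg = {
		.groups = ULOG_MAXNLGROUPS,	/* multicast groups to serve */
	};

	/* cfg replaces the old input/mutex/groups argument list */
	return netlink_kernel_create(&init_net, NETLINK_NFLOG,
				     THIS_MODULE, &cfg);
}

static struct nlmsghdr *ulog_start_msg_sketch(struct sk_buff *skb,
					      int seq, size_t payload)
{
	/* returns NULL on failure; no hidden goto as with NLMSG_PUT() */
	return nlmsg_put(skb, 0, seq, ULOG_NL_EVENT, payload, 0);
}
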
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 91747d4ebc26..e7ff2dcab6ce 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -95,11 +95,11 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
95 return NF_ACCEPT; 95 return NF_ACCEPT;
96} 96}
97 97
98static unsigned int ipv4_confirm(unsigned int hooknum, 98static unsigned int ipv4_helper(unsigned int hooknum,
99 struct sk_buff *skb, 99 struct sk_buff *skb,
100 const struct net_device *in, 100 const struct net_device *in,
101 const struct net_device *out, 101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *)) 102 int (*okfn)(struct sk_buff *))
103{ 103{
104 struct nf_conn *ct; 104 struct nf_conn *ct;
105 enum ip_conntrack_info ctinfo; 105 enum ip_conntrack_info ctinfo;
@@ -110,24 +110,38 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
110 /* This is where we call the helper: as the packet goes out. */ 110 /* This is where we call the helper: as the packet goes out. */
111 ct = nf_ct_get(skb, &ctinfo); 111 ct = nf_ct_get(skb, &ctinfo);
112 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 112 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
113 goto out; 113 return NF_ACCEPT;
114 114
115 help = nfct_help(ct); 115 help = nfct_help(ct);
116 if (!help) 116 if (!help)
117 goto out; 117 return NF_ACCEPT;
118 118
119 /* rcu_read_lock()ed by nf_hook_slow */ 119 /* rcu_read_lock()ed by nf_hook_slow */
120 helper = rcu_dereference(help->helper); 120 helper = rcu_dereference(help->helper);
121 if (!helper) 121 if (!helper)
122 goto out; 122 return NF_ACCEPT;
123 123
124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 124 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
125 ct, ctinfo); 125 ct, ctinfo);
126 if (ret != NF_ACCEPT) { 126 if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) {
127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, 127 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
128 "nf_ct_%s: dropping packet", helper->name); 128 "nf_ct_%s: dropping packet", helper->name);
129 return ret;
130 } 129 }
130 return ret;
131}
132
133static unsigned int ipv4_confirm(unsigned int hooknum,
134 struct sk_buff *skb,
135 const struct net_device *in,
136 const struct net_device *out,
137 int (*okfn)(struct sk_buff *))
138{
139 struct nf_conn *ct;
140 enum ip_conntrack_info ctinfo;
141
142 ct = nf_ct_get(skb, &ctinfo);
143 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
144 goto out;
131 145
132 /* adjust seqs for loopback traffic only in outgoing direction */ 146 /* adjust seqs for loopback traffic only in outgoing direction */
133 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 147 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -185,6 +199,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
185 .priority = NF_IP_PRI_CONNTRACK, 199 .priority = NF_IP_PRI_CONNTRACK,
186 }, 200 },
187 { 201 {
202 .hook = ipv4_helper,
203 .owner = THIS_MODULE,
204 .pf = NFPROTO_IPV4,
205 .hooknum = NF_INET_POST_ROUTING,
206 .priority = NF_IP_PRI_CONNTRACK_HELPER,
207 },
208 {
188 .hook = ipv4_confirm, 209 .hook = ipv4_confirm,
189 .owner = THIS_MODULE, 210 .owner = THIS_MODULE,
190 .pf = NFPROTO_IPV4, 211 .pf = NFPROTO_IPV4,
@@ -192,6 +213,13 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
192 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 213 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
193 }, 214 },
194 { 215 {
216 .hook = ipv4_helper,
217 .owner = THIS_MODULE,
218 .pf = NFPROTO_IPV4,
219 .hooknum = NF_INET_LOCAL_IN,
220 .priority = NF_IP_PRI_CONNTRACK_HELPER,
221 },
222 {
195 .hook = ipv4_confirm, 223 .hook = ipv4_confirm,
196 .owner = THIS_MODULE, 224 .owner = THIS_MODULE,
197 .pf = NFPROTO_IPV4, 225 .pf = NFPROTO_IPV4,
@@ -207,35 +235,30 @@ static int log_invalid_proto_max = 255;
207static ctl_table ip_ct_sysctl_table[] = { 235static ctl_table ip_ct_sysctl_table[] = {
208 { 236 {
209 .procname = "ip_conntrack_max", 237 .procname = "ip_conntrack_max",
210 .data = &nf_conntrack_max,
211 .maxlen = sizeof(int), 238 .maxlen = sizeof(int),
212 .mode = 0644, 239 .mode = 0644,
213 .proc_handler = proc_dointvec, 240 .proc_handler = proc_dointvec,
214 }, 241 },
215 { 242 {
216 .procname = "ip_conntrack_count", 243 .procname = "ip_conntrack_count",
217 .data = &init_net.ct.count,
218 .maxlen = sizeof(int), 244 .maxlen = sizeof(int),
219 .mode = 0444, 245 .mode = 0444,
220 .proc_handler = proc_dointvec, 246 .proc_handler = proc_dointvec,
221 }, 247 },
222 { 248 {
223 .procname = "ip_conntrack_buckets", 249 .procname = "ip_conntrack_buckets",
224 .data = &init_net.ct.htable_size,
225 .maxlen = sizeof(unsigned int), 250 .maxlen = sizeof(unsigned int),
226 .mode = 0444, 251 .mode = 0444,
227 .proc_handler = proc_dointvec, 252 .proc_handler = proc_dointvec,
228 }, 253 },
229 { 254 {
230 .procname = "ip_conntrack_checksum", 255 .procname = "ip_conntrack_checksum",
231 .data = &init_net.ct.sysctl_checksum,
232 .maxlen = sizeof(int), 256 .maxlen = sizeof(int),
233 .mode = 0644, 257 .mode = 0644,
234 .proc_handler = proc_dointvec, 258 .proc_handler = proc_dointvec,
235 }, 259 },
236 { 260 {
237 .procname = "ip_conntrack_log_invalid", 261 .procname = "ip_conntrack_log_invalid",
238 .data = &init_net.ct.sysctl_log_invalid,
239 .maxlen = sizeof(unsigned int), 262 .maxlen = sizeof(unsigned int),
240 .mode = 0644, 263 .mode = 0644,
241 .proc_handler = proc_dointvec_minmax, 264 .proc_handler = proc_dointvec_minmax,
@@ -351,6 +374,25 @@ static struct nf_sockopt_ops so_getorigdst = {
351 .owner = THIS_MODULE, 374 .owner = THIS_MODULE,
352}; 375};
353 376
377static int ipv4_init_net(struct net *net)
378{
379#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
380 struct nf_ip_net *in = &net->ct.nf_ct_proto;
381 in->ctl_table = kmemdup(ip_ct_sysctl_table,
382 sizeof(ip_ct_sysctl_table),
383 GFP_KERNEL);
384 if (!in->ctl_table)
385 return -ENOMEM;
386
387 in->ctl_table[0].data = &nf_conntrack_max;
388 in->ctl_table[1].data = &net->ct.count;
389 in->ctl_table[2].data = &net->ct.htable_size;
390 in->ctl_table[3].data = &net->ct.sysctl_checksum;
391 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
392#endif
393 return 0;
394}
395
354struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 396struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
355 .l3proto = PF_INET, 397 .l3proto = PF_INET,
356 .name = "ipv4", 398 .name = "ipv4",
@@ -366,8 +408,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
366#endif 408#endif
367#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 409#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
368 .ctl_table_path = "net/ipv4/netfilter", 410 .ctl_table_path = "net/ipv4/netfilter",
369 .ctl_table = ip_ct_sysctl_table,
370#endif 411#endif
412 .init_net = ipv4_init_net,
371 .me = THIS_MODULE, 413 .me = THIS_MODULE,
372}; 414};
373 415
@@ -378,6 +420,65 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
378MODULE_ALIAS("ip_conntrack"); 420MODULE_ALIAS("ip_conntrack");
379MODULE_LICENSE("GPL"); 421MODULE_LICENSE("GPL");
380 422
423static int ipv4_net_init(struct net *net)
424{
425 int ret = 0;
426
427 ret = nf_conntrack_l4proto_register(net,
428 &nf_conntrack_l4proto_tcp4);
429 if (ret < 0) {
430 pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
431 goto out_tcp;
432 }
433 ret = nf_conntrack_l4proto_register(net,
434 &nf_conntrack_l4proto_udp4);
435 if (ret < 0) {
436 pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
437 goto out_udp;
438 }
439 ret = nf_conntrack_l4proto_register(net,
440 &nf_conntrack_l4proto_icmp);
441 if (ret < 0) {
442 pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
443 goto out_icmp;
444 }
445 ret = nf_conntrack_l3proto_register(net,
446 &nf_conntrack_l3proto_ipv4);
447 if (ret < 0) {
448 pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
449 goto out_ipv4;
450 }
451 return 0;
452out_ipv4:
453 nf_conntrack_l4proto_unregister(net,
454 &nf_conntrack_l4proto_icmp);
455out_icmp:
456 nf_conntrack_l4proto_unregister(net,
457 &nf_conntrack_l4proto_udp4);
458out_udp:
459 nf_conntrack_l4proto_unregister(net,
460 &nf_conntrack_l4proto_tcp4);
461out_tcp:
462 return ret;
463}
464
465static void ipv4_net_exit(struct net *net)
466{
467 nf_conntrack_l3proto_unregister(net,
468 &nf_conntrack_l3proto_ipv4);
469 nf_conntrack_l4proto_unregister(net,
470 &nf_conntrack_l4proto_icmp);
471 nf_conntrack_l4proto_unregister(net,
472 &nf_conntrack_l4proto_udp4);
473 nf_conntrack_l4proto_unregister(net,
474 &nf_conntrack_l4proto_tcp4);
475}
476
477static struct pernet_operations ipv4_net_ops = {
478 .init = ipv4_net_init,
479 .exit = ipv4_net_exit,
480};
481
381static int __init nf_conntrack_l3proto_ipv4_init(void) 482static int __init nf_conntrack_l3proto_ipv4_init(void)
382{ 483{
383 int ret = 0; 484 int ret = 0;
@@ -391,35 +492,17 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
391 return ret; 492 return ret;
392 } 493 }
393 494
394 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); 495 ret = register_pernet_subsys(&ipv4_net_ops);
395 if (ret < 0) { 496 if (ret < 0) {
396 pr_err("nf_conntrack_ipv4: can't register tcp.\n"); 497 pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
397 goto cleanup_sockopt; 498 goto cleanup_sockopt;
398 } 499 }
399 500
400 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
401 if (ret < 0) {
402 pr_err("nf_conntrack_ipv4: can't register udp.\n");
403 goto cleanup_tcp;
404 }
405
406 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
407 if (ret < 0) {
408 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
409 goto cleanup_udp;
410 }
411
412 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
413 if (ret < 0) {
414 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
415 goto cleanup_icmp;
416 }
417
418 ret = nf_register_hooks(ipv4_conntrack_ops, 501 ret = nf_register_hooks(ipv4_conntrack_ops,
419 ARRAY_SIZE(ipv4_conntrack_ops)); 502 ARRAY_SIZE(ipv4_conntrack_ops));
420 if (ret < 0) { 503 if (ret < 0) {
421 pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 504 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
422 goto cleanup_ipv4; 505 goto cleanup_pernet;
423 } 506 }
424#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 507#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
425 ret = nf_conntrack_ipv4_compat_init(); 508 ret = nf_conntrack_ipv4_compat_init();
@@ -431,14 +514,8 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
431 cleanup_hooks: 514 cleanup_hooks:
432 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 515 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
433#endif 516#endif
434 cleanup_ipv4: 517 cleanup_pernet:
435 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 518 unregister_pernet_subsys(&ipv4_net_ops);
436 cleanup_icmp:
437 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
438 cleanup_udp:
439 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
440 cleanup_tcp:
441 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
442 cleanup_sockopt: 519 cleanup_sockopt:
443 nf_unregister_sockopt(&so_getorigdst); 520 nf_unregister_sockopt(&so_getorigdst);
444 return ret; 521 return ret;
@@ -451,10 +528,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
451 nf_conntrack_ipv4_compat_fini(); 528 nf_conntrack_ipv4_compat_fini();
452#endif 529#endif
453 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 530 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
454 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); 531 unregister_pernet_subsys(&ipv4_net_ops);
455 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
456 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
457 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
458 nf_unregister_sockopt(&so_getorigdst); 532 nf_unregister_sockopt(&so_getorigdst);
459} 533}
460 534
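
Editor's note: the conntrack l3/l4 registration above moves from one-shot calls in module_init() to a struct pernet_operations whose init/exit run for every network namespace. The generic shape of that conversion, with hypothetical names standing in for the real registrations:

/* Minimal sketch of the pernet_operations pattern adopted above. */
#include <linux/init.h>
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init example_net_init(struct net *net)
{
	/* register per-namespace protocol trackers, sysctls, proc files */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* undo whatever example_net_init() registered for this netns */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

static int __init example_module_init(void)
{
	/* runs init for existing namespaces and for each new one created */
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");
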
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 0847e373d33c..5241d997ab75 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,6 +23,11 @@
23 23
24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
25 25
26static inline struct nf_icmp_net *icmp_pernet(struct net *net)
27{
28 return &net->ct.nf_ct_proto.icmp;
29}
30
26static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 31static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
27 struct nf_conntrack_tuple *tuple) 32 struct nf_conntrack_tuple *tuple)
28{ 33{
@@ -77,7 +82,7 @@ static int icmp_print_tuple(struct seq_file *s,
77 82
78static unsigned int *icmp_get_timeouts(struct net *net) 83static unsigned int *icmp_get_timeouts(struct net *net)
79{ 84{
80 return &nf_ct_icmp_timeout; 85 return &icmp_pernet(net)->timeout;
81} 86}
82 87
83/* Returns verdict for packet, or -1 for invalid. */ 88/* Returns verdict for packet, or -1 for invalid. */
@@ -274,16 +279,18 @@ static int icmp_nlattr_tuple_size(void)
274#include <linux/netfilter/nfnetlink.h> 279#include <linux/netfilter/nfnetlink.h>
275#include <linux/netfilter/nfnetlink_cttimeout.h> 280#include <linux/netfilter/nfnetlink_cttimeout.h>
276 281
277static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) 282static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
283 struct net *net, void *data)
278{ 284{
279 unsigned int *timeout = data; 285 unsigned int *timeout = data;
286 struct nf_icmp_net *in = icmp_pernet(net);
280 287
281 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { 288 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
282 *timeout = 289 *timeout =
283 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; 290 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
284 } else { 291 } else {
285 /* Set default ICMP timeout. */ 292 /* Set default ICMP timeout. */
286 *timeout = nf_ct_icmp_timeout; 293 *timeout = in->timeout;
287 } 294 }
288 return 0; 295 return 0;
289} 296}
@@ -308,11 +315,9 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
308#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 315#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
309 316
310#ifdef CONFIG_SYSCTL 317#ifdef CONFIG_SYSCTL
311static struct ctl_table_header *icmp_sysctl_header;
312static struct ctl_table icmp_sysctl_table[] = { 318static struct ctl_table icmp_sysctl_table[] = {
313 { 319 {
314 .procname = "nf_conntrack_icmp_timeout", 320 .procname = "nf_conntrack_icmp_timeout",
315 .data = &nf_ct_icmp_timeout,
316 .maxlen = sizeof(unsigned int), 321 .maxlen = sizeof(unsigned int),
317 .mode = 0644, 322 .mode = 0644,
318 .proc_handler = proc_dointvec_jiffies, 323 .proc_handler = proc_dointvec_jiffies,
@@ -323,7 +328,6 @@ static struct ctl_table icmp_sysctl_table[] = {
323static struct ctl_table icmp_compat_sysctl_table[] = { 328static struct ctl_table icmp_compat_sysctl_table[] = {
324 { 329 {
325 .procname = "ip_conntrack_icmp_timeout", 330 .procname = "ip_conntrack_icmp_timeout",
326 .data = &nf_ct_icmp_timeout,
327 .maxlen = sizeof(unsigned int), 331 .maxlen = sizeof(unsigned int),
328 .mode = 0644, 332 .mode = 0644,
329 .proc_handler = proc_dointvec_jiffies, 333 .proc_handler = proc_dointvec_jiffies,
@@ -333,6 +337,62 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
333#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 337#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
334#endif /* CONFIG_SYSCTL */ 338#endif /* CONFIG_SYSCTL */
335 339
340static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
341 struct nf_icmp_net *in)
342{
343#ifdef CONFIG_SYSCTL
344 pn->ctl_table = kmemdup(icmp_sysctl_table,
345 sizeof(icmp_sysctl_table),
346 GFP_KERNEL);
347 if (!pn->ctl_table)
348 return -ENOMEM;
349
350 pn->ctl_table[0].data = &in->timeout;
351#endif
352 return 0;
353}
354
355static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
356 struct nf_icmp_net *in)
357{
358#ifdef CONFIG_SYSCTL
359#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
360 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
361 sizeof(icmp_compat_sysctl_table),
362 GFP_KERNEL);
363 if (!pn->ctl_compat_table)
364 return -ENOMEM;
365
366 pn->ctl_compat_table[0].data = &in->timeout;
367#endif
368#endif
369 return 0;
370}
371
372static int icmp_init_net(struct net *net, u_int16_t proto)
373{
374 int ret;
375 struct nf_icmp_net *in = icmp_pernet(net);
376 struct nf_proto_net *pn = &in->pn;
377
378 in->timeout = nf_ct_icmp_timeout;
379
380 ret = icmp_kmemdup_compat_sysctl_table(pn, in);
381 if (ret < 0)
382 return ret;
383
384 ret = icmp_kmemdup_sysctl_table(pn, in);
385 if (ret < 0)
386 nf_ct_kfree_compat_sysctl_table(pn);
387
388 return ret;
389}
390
391static struct nf_proto_net *icmp_get_net_proto(struct net *net)
392{
393 return &net->ct.nf_ct_proto.icmp.pn;
394}
395
336struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = 396struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
337{ 397{
338 .l3proto = PF_INET, 398 .l3proto = PF_INET,
@@ -362,11 +422,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
362 .nla_policy = icmp_timeout_nla_policy, 422 .nla_policy = icmp_timeout_nla_policy,
363 }, 423 },
364#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ 424#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
365#ifdef CONFIG_SYSCTL 425 .init_net = icmp_init_net,
366 .ctl_table_header = &icmp_sysctl_header, 426 .get_net_proto = icmp_get_net_proto,
367 .ctl_table = icmp_sysctl_table,
368#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
369 .ctl_compat_table = icmp_compat_sysctl_table,
370#endif
371#endif
372}; 427};
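
Editor's note: icmp_kmemdup_sysctl_table() above shows the idiom used throughout this conversion: the ctl_table stays a static template with empty .data fields, each namespace kmemdup()s its own copy, and the copy's .data pointers are aimed at the per-netns state. A stripped-down sketch with a hypothetical per-netns structure:

/* Sketch of the per-netns sysctl duplication idiom used above. */
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysctl.h>

struct example_net {			/* hypothetical per-netns state */
	unsigned int timeout;
};

static struct ctl_table example_sysctl_template[] = {
	{
		.procname	= "example_timeout",
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		/* .data is filled in per namespace below */
	},
	{ }
};

static int example_init_sysctl(struct example_net *en, struct ctl_table **tbl)
{
	*tbl = kmemdup(example_sysctl_template,
		       sizeof(example_sysctl_template), GFP_KERNEL);
	if (!*tbl)
		return -ENOMEM;

	(*tbl)[0].data = &en->timeout;	/* point at this netns, not a global */
	return 0;
}
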
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9bb1b8a37a22..742815518b0f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
94 { 94 {
95 .hook = ipv4_conntrack_defrag, 95 .hook = ipv4_conntrack_defrag,
96 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
97 .pf = PF_INET, 97 .pf = NFPROTO_IPV4,
98 .hooknum = NF_INET_PRE_ROUTING, 98 .hooknum = NF_INET_PRE_ROUTING,
99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
100 }, 100 },
101 { 101 {
102 .hook = ipv4_conntrack_defrag, 102 .hook = ipv4_conntrack_defrag,
103 .owner = THIS_MODULE, 103 .owner = THIS_MODULE,
104 .pf = PF_INET, 104 .pf = NFPROTO_IPV4,
105 .hooknum = NF_INET_LOCAL_OUT, 105 .hooknum = NF_INET_LOCAL_OUT,
106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
107 }, 107 },
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 7b22382ff0e9..3c04d24e2976 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -13,10 +13,10 @@
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include <linux/udp.h> 14#include <linux/udp.h>
15 15
16#include <net/netfilter/nf_nat_helper.h>
17#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_conntrack_helper.h> 16#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_expect.h> 17#include <net/netfilter/nf_conntrack_expect.h>
18#include <net/netfilter/nf_nat_helper.h>
19#include <net/netfilter/nf_nat_rule.h>
20#include <linux/netfilter/nf_conntrack_amanda.h> 20#include <linux/netfilter/nf_conntrack_amanda.h>
21 21
22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); 22MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index abb52adf5acd..44b082fd48ab 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -691,6 +691,10 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
691 .expectfn = nf_nat_follow_master, 691 .expectfn = nf_nat_follow_master,
692}; 692};
693 693
694static struct nfq_ct_nat_hook nfq_ct_nat = {
695 .seq_adjust = nf_nat_tcp_seq_adjust,
696};
697
694static int __init nf_nat_init(void) 698static int __init nf_nat_init(void)
695{ 699{
696 size_t i; 700 size_t i;
@@ -731,6 +735,7 @@ static int __init nf_nat_init(void)
731 nfnetlink_parse_nat_setup); 735 nfnetlink_parse_nat_setup);
732 BUG_ON(nf_ct_nat_offset != NULL); 736 BUG_ON(nf_ct_nat_offset != NULL);
733 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); 737 RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
738 RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
734 return 0; 739 return 0;
735 740
736 cleanup_extend: 741 cleanup_extend:
@@ -747,6 +752,7 @@ static void __exit nf_nat_cleanup(void)
747 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); 752 RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
748 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); 753 RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
749 RCU_INIT_POINTER(nf_ct_nat_offset, NULL); 754 RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
755 RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
750 synchronize_net(); 756 synchronize_net();
751} 757}
752 758
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index cad29c121318..c6784a18c1c4 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -95,7 +95,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
95 unsigned char **data, 95 unsigned char **data,
96 TransportAddress *taddr, int count) 96 TransportAddress *taddr, int count)
97{ 97{
98 const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 98 const struct nf_ct_h323_master *info = nfct_help_data(ct);
99 int dir = CTINFO2DIR(ctinfo); 99 int dir = CTINFO2DIR(ctinfo);
100 int i; 100 int i;
101 __be16 port; 101 __be16 port;
@@ -178,7 +178,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
178 struct nf_conntrack_expect *rtp_exp, 178 struct nf_conntrack_expect *rtp_exp,
179 struct nf_conntrack_expect *rtcp_exp) 179 struct nf_conntrack_expect *rtcp_exp)
180{ 180{
181 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 181 struct nf_ct_h323_master *info = nfct_help_data(ct);
182 int dir = CTINFO2DIR(ctinfo); 182 int dir = CTINFO2DIR(ctinfo);
183 int i; 183 int i;
184 u_int16_t nated_port; 184 u_int16_t nated_port;
@@ -330,7 +330,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
330 TransportAddress *taddr, __be16 port, 330 TransportAddress *taddr, __be16 port,
331 struct nf_conntrack_expect *exp) 331 struct nf_conntrack_expect *exp)
332{ 332{
333 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 333 struct nf_ct_h323_master *info = nfct_help_data(ct);
334 int dir = CTINFO2DIR(ctinfo); 334 int dir = CTINFO2DIR(ctinfo);
335 u_int16_t nated_port = ntohs(port); 335 u_int16_t nated_port = ntohs(port);
336 336
@@ -419,7 +419,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
419 unsigned char **data, TransportAddress *taddr, int idx, 419 unsigned char **data, TransportAddress *taddr, int idx,
420 __be16 port, struct nf_conntrack_expect *exp) 420 __be16 port, struct nf_conntrack_expect *exp)
421{ 421{
422 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; 422 struct nf_ct_h323_master *info = nfct_help_data(ct);
423 int dir = CTINFO2DIR(ctinfo); 423 int dir = CTINFO2DIR(ctinfo);
424 u_int16_t nated_port = ntohs(port); 424 u_int16_t nated_port = ntohs(port);
425 union nf_inet_addr addr; 425 union nf_inet_addr addr;
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index af65958f6308..2e59ad0b90ca 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,19 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
157 u32 ctinfo, int off)
158{
159 const struct tcphdr *th;
160
161 if (nf_ct_protonum(ct) != IPPROTO_TCP)
162 return;
163
164 th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
165 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
166}
167EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
168
156static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, 169static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen) 170 int datalen, __sum16 *check, int oldlen)
158{ 171{
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index c273d58980ae..388140881ebe 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -49,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
49 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
50 struct nf_nat_ipv4_range range; 50 struct nf_nat_ipv4_range range;
51 51
52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info; 52 ct_pptp_info = nfct_help_data(master);
53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
54 54
55 /* And here goes the grand finale of corrosion... */ 55 /* And here goes the grand finale of corrosion... */
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
123 __be16 new_callid; 123 __be16 new_callid;
124 unsigned int cid_off; 124 unsigned int cid_off;
125 125
126 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 126 ct_pptp_info = nfct_help_data(ct);
127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
128 128
129 new_callid = ct_pptp_info->pns_call_id; 129 new_callid = ct_pptp_info->pns_call_id;
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
192 struct nf_ct_pptp_master *ct_pptp_info; 192 struct nf_ct_pptp_master *ct_pptp_info;
193 struct nf_nat_pptp *nat_pptp_info; 193 struct nf_nat_pptp *nat_pptp_info;
194 194
195 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info; 195 ct_pptp_info = nfct_help_data(ct);
196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
197 197
198 /* save original PAC call ID in nat_info */ 198 /* save original PAC call ID in nat_info */
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 746edec8b86e..bac712293fd6 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -405,7 +405,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
405 405
406 ptr = *octets; 406 ptr = *octets;
407 while (ctx->pointer < eoc) { 407 while (ctx->pointer < eoc) {
408 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { 408 if (!asn1_octet_decode(ctx, ptr++)) {
409 kfree(*octets); 409 kfree(*octets);
410 *octets = NULL; 410 *octets = NULL;
411 return 0; 411 return 0;
@@ -759,7 +759,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
759 } 759 }
760 break; 760 break;
761 case SNMP_OBJECTID: 761 case SNMP_OBJECTID:
762 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { 762 if (!asn1_oid_decode(ctx, end, &lp, &len)) {
763 kfree(id); 763 kfree(id);
764 return 0; 764 return 0;
765 } 765 }
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index a2901bf829c0..9dbb8d284f99 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -8,10 +8,10 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/udp.h> 9#include <linux/udp.h>
10 10
11#include <net/netfilter/nf_nat_helper.h>
12#include <net/netfilter/nf_nat_rule.h>
13#include <net/netfilter/nf_conntrack_helper.h> 11#include <net/netfilter/nf_conntrack_helper.h>
14#include <net/netfilter/nf_conntrack_expect.h> 12#include <net/netfilter/nf_conntrack_expect.h>
13#include <net/netfilter/nf_nat_helper.h>
14#include <net/netfilter/nf_nat_rule.h>
15#include <linux/netfilter/nf_conntrack_tftp.h> 15#include <linux/netfilter/nf_conntrack_tftp.h>
16 16
17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); 17MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2c00e8bf684d..6232d476f37e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -371,6 +371,7 @@ void ping_err(struct sk_buff *skb, u32 info)
371 break; 371 break;
372 case ICMP_DEST_UNREACH: 372 case ICMP_DEST_UNREACH:
373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 373 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
374 ipv4_sk_update_pmtu(skb, sk, info);
374 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { 375 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
375 err = EMSGSIZE; 376 err = EMSGSIZE;
376 harderr = 1; 377 harderr = 1;
@@ -386,6 +387,7 @@ void ping_err(struct sk_buff *skb, u32 info)
386 break; 387 break;
387 case ICMP_REDIRECT: 388 case ICMP_REDIRECT:
388 /* See ICMP_SOURCE_QUENCH */ 389 /* See ICMP_SOURCE_QUENCH */
390 ipv4_sk_redirect(skb, sk);
389 err = EREMOTEIO; 391 err = EREMOTEIO;
390 break; 392 break;
391 } 393 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44e4e22..957acd12250b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -232,7 +232,6 @@ static const struct snmp_mib snmp4_net_list[] = {
232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), 232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), 233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), 234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
235 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
236 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), 235 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
237 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), 236 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
238 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), 237 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -258,6 +257,12 @@ static const struct snmp_mib snmp4_net_list[] = {
258 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), 257 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
259 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL), 258 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
260 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE), 259 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
260 SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
261 SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
262 SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
261 SNMP_MIB_SENTINEL 266 SNMP_MIB_SENTINEL
262}; 267};
263 268
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 9ae5c01cd0b2..8918eff1426d 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -36,9 +36,7 @@ const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
36 36
37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
38{ 38{
39 int hash = protocol & (MAX_INET_PROTOS - 1); 39 return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
40
41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
42 NULL, prot) ? 0 : -1; 40 NULL, prot) ? 0 : -1;
43} 41}
44EXPORT_SYMBOL(inet_add_protocol); 42EXPORT_SYMBOL(inet_add_protocol);
@@ -49,9 +47,9 @@ EXPORT_SYMBOL(inet_add_protocol);
49 47
50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 48int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
51{ 49{
52 int ret, hash = protocol & (MAX_INET_PROTOS - 1); 50 int ret;
53 51
54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], 52 ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
55 prot, NULL) == prot) ? 0 : -1; 53 prot, NULL) == prot) ? 0 : -1;
56 54
57 synchronize_net(); 55 synchronize_net();
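
Editor's note: inet_add_protocol()/inet_del_protocol() above drop the "& (MAX_INET_PROTOS - 1)" masking and index the table directly by protocol number; the registration itself keeps the lockless cmpxchg() idiom, which succeeds only if the slot is still empty (or, on removal, still holds the expected handler). A small kernel-context sketch of that idiom with a hypothetical table:

/* Sketch of the lockless slot-claim idiom kept by the hunk above. */
#include <linux/atomic.h>
#include <linux/stddef.h>

#define EXAMPLE_SLOTS 256

static void *example_handlers[EXAMPLE_SLOTS];	/* hypothetical table */

static int example_register(unsigned char num, void *handler)
{
	/* succeeds only if the slot was still NULL */
	return !cmpxchg(&example_handlers[num], NULL, handler) ? 0 : -1;
}

static int example_unregister(unsigned char num, void *handler)
{
	/* succeeds only if the slot still holds the handler being removed */
	return cmpxchg(&example_handlers[num], handler, NULL) == handler ? 0 : -1;
}
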
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4032b818f3e4..ff0f071969ea 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -216,6 +216,11 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
216 int err = 0; 216 int err = 0;
217 int harderr = 0; 217 int harderr = 0;
218 218
219 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
220 ipv4_sk_update_pmtu(skb, sk, info);
221 else if (type == ICMP_REDIRECT)
222 ipv4_sk_redirect(skb, sk);
223
219 /* Report error on raw socket, if: 224 /* Report error on raw socket, if:
220 1. User requested ip_recverr. 225 1. User requested ip_recverr.
221 2. Socket is connected (otherwise the error indication 226 2. Socket is connected (otherwise the error indication
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 98b30d08efe9..6bcb8fc71cbc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256; 135static int ip_rt_min_advmss __read_mostly = 256;
136static int rt_chain_length_max __read_mostly = 20;
137
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140 136
141/* 137/*
142 * Interface to generic destination cache. 138 * Interface to generic destination cache.
@@ -145,11 +141,12 @@ static unsigned long expires_ljiffies;
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147static unsigned int ipv4_mtu(const struct dst_entry *dst); 143static unsigned int ipv4_mtu(const struct dst_entry *dst);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb); 145static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 146static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152static int rt_garbage_collect(struct dst_ops *ops); 147 struct sk_buff *skb, u32 mtu);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb);
153 150
154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 151static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155 int how) 152 int how)
@@ -158,54 +155,26 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 155
159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 156static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160{ 157{
161 struct rtable *rt = (struct rtable *) dst; 158 WARN_ON(1);
162 struct inet_peer *peer; 159 return NULL;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
192} 160}
193 161
194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr); 162static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
163 struct sk_buff *skb,
164 const void *daddr);
195 165
196static struct dst_ops ipv4_dst_ops = { 166static struct dst_ops ipv4_dst_ops = {
197 .family = AF_INET, 167 .family = AF_INET,
198 .protocol = cpu_to_be16(ETH_P_IP), 168 .protocol = cpu_to_be16(ETH_P_IP),
199 .gc = rt_garbage_collect,
200 .check = ipv4_dst_check, 169 .check = ipv4_dst_check,
201 .default_advmss = ipv4_default_advmss, 170 .default_advmss = ipv4_default_advmss,
202 .mtu = ipv4_mtu, 171 .mtu = ipv4_mtu,
203 .cow_metrics = ipv4_cow_metrics, 172 .cow_metrics = ipv4_cow_metrics,
204 .destroy = ipv4_dst_destroy,
205 .ifdown = ipv4_dst_ifdown, 173 .ifdown = ipv4_dst_ifdown,
206 .negative_advice = ipv4_negative_advice, 174 .negative_advice = ipv4_negative_advice,
207 .link_failure = ipv4_link_failure, 175 .link_failure = ipv4_link_failure,
208 .update_pmtu = ip_rt_update_pmtu, 176 .update_pmtu = ip_rt_update_pmtu,
177 .redirect = ip_do_redirect,
209 .local_out = __ip_local_out, 178 .local_out = __ip_local_out,
210 .neigh_lookup = ipv4_neigh_lookup, 179 .neigh_lookup = ipv4_neigh_lookup,
211}; 180};
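
The table above re-wires the dst_ops callbacks: update_pmtu now takes the socket, skb and mtu, and a new redirect hook is added. As a rough illustration only (the wrapper name below is hypothetical; the member names and signatures are the ones registered in this table, assuming the usual struct dst_entry / struct dst_ops layout):

	/* Illustrative only, not part of this patch: how a caller can reach
	 * the two callbacks registered above through a dst_entry. */
	static void example_dst_error_event(struct dst_entry *dst, struct sock *sk,
					    struct sk_buff *skb, u32 mtu)
	{
		if (mtu && dst->ops->update_pmtu)
			dst->ops->update_pmtu(dst, sk, skb, mtu);	/* -> ip_rt_update_pmtu() */
		else if (dst->ops->redirect)
			dst->ops->redirect(dst, sk, skb);		/* -> ip_do_redirect() */
	}
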
@@ -232,184 +201,30 @@ const __u8 ip_tos2prio[16] = {
232}; 201};
233EXPORT_SYMBOL(ip_tos2prio); 202EXPORT_SYMBOL(ip_tos2prio);
234 203
235/*
236 * Route cache.
237 */
238
239/* The locking scheme is rather straight forward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249struct rt_hash_bucket {
250 struct rtable __rcu *chain;
251};
252
253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
255/*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
257 * The size of this table is a power of two and depends on the number of CPUS.
258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 */
260#ifdef CONFIG_LOCKDEP
261# define RT_HASH_LOCK_SZ 256
262#else
263# if NR_CPUS >= 32
264# define RT_HASH_LOCK_SZ 4096
265# elif NR_CPUS >= 16
266# define RT_HASH_LOCK_SZ 2048
267# elif NR_CPUS >= 8
268# define RT_HASH_LOCK_SZ 1024
269# elif NR_CPUS >= 4
270# define RT_HASH_LOCK_SZ 512
271# else
272# define RT_HASH_LOCK_SZ 256
273# endif
274#endif
275
276static spinlock_t *rt_hash_locks;
277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279static __init void rt_hash_lock_init(void)
280{
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290}
291#else
292# define rt_hash_lock_addr(slot) NULL
293
294static inline void rt_hash_lock_init(void)
295{
296}
297#endif
298
299static struct rt_hash_bucket *rt_hash_table __read_mostly;
300static unsigned int rt_hash_mask __read_mostly;
301static unsigned int rt_hash_log __read_mostly;
302
303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 204static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 205#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305 206
306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 int genid)
308{
309 return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 idx, genid)
311 & rt_hash_mask;
312}
313
314static inline int rt_genid(struct net *net) 207static inline int rt_genid(struct net *net)
315{ 208{
316 return atomic_read(&net->ipv4.rt_genid); 209 return atomic_read(&net->ipv4.rt_genid);
317} 210}
318 211
319#ifdef CONFIG_PROC_FS 212#ifdef CONFIG_PROC_FS
320struct rt_cache_iter_state {
321 struct seq_net_private p;
322 int bucket;
323 int genid;
324};
325
326static struct rtable *rt_cache_get_first(struct seq_file *seq)
327{
328 struct rt_cache_iter_state *st = seq->private;
329 struct rtable *r = NULL;
330
331 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333 continue;
334 rcu_read_lock_bh();
335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 while (r) {
337 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 r->rt_genid == st->genid)
339 return r;
340 r = rcu_dereference_bh(r->dst.rt_next);
341 }
342 rcu_read_unlock_bh();
343 }
344 return r;
345}
346
347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 struct rtable *r)
349{
350 struct rt_cache_iter_state *st = seq->private;
351
352 r = rcu_dereference_bh(r->dst.rt_next);
353 while (!r) {
354 rcu_read_unlock_bh();
355 do {
356 if (--st->bucket < 0)
357 return NULL;
358 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359 rcu_read_lock_bh();
360 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 }
362 return r;
363}
364
365static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 struct rtable *r)
367{
368 struct rt_cache_iter_state *st = seq->private;
369 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 continue;
372 if (r->rt_genid == st->genid)
373 break;
374 }
375 return r;
376}
377
378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379{
380 struct rtable *r = rt_cache_get_first(seq);
381
382 if (r)
383 while (pos && (r = rt_cache_get_next(seq, r)))
384 --pos;
385 return pos ? NULL : r;
386}
387
388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 213static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389{ 214{
390 struct rt_cache_iter_state *st = seq->private;
391 if (*pos) 215 if (*pos)
392 return rt_cache_get_idx(seq, *pos - 1); 216 return NULL;
393 st->genid = rt_genid(seq_file_net(seq));
394 return SEQ_START_TOKEN; 217 return SEQ_START_TOKEN;
395} 218}
396 219
397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 220static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398{ 221{
399 struct rtable *r;
400
401 if (v == SEQ_START_TOKEN)
402 r = rt_cache_get_first(seq);
403 else
404 r = rt_cache_get_next(seq, v);
405 ++*pos; 222 ++*pos;
406 return r; 223 return NULL;
407} 224}
408 225
409static void rt_cache_seq_stop(struct seq_file *seq, void *v) 226static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410{ 227{
411 if (v && v != SEQ_START_TOKEN)
412 rcu_read_unlock_bh();
413} 228}
414 229
415static int rt_cache_seq_show(struct seq_file *seq, void *v) 230static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -419,34 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
419 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 234 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 235 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421 "HHUptod\tSpecDst"); 236 "HHUptod\tSpecDst");
422 else {
423 struct rtable *r = v;
424 struct neighbour *n;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour_noref(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW),
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 dst_metric(&r->dst, RTAX_RTTVAR)),
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447
448 seq_printf(seq, "%*s\n", 127 - len, "");
449 }
450 return 0; 237 return 0;
451} 238}
452 239
@@ -459,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {
459 246
460static int rt_cache_seq_open(struct inode *inode, struct file *file) 247static int rt_cache_seq_open(struct inode *inode, struct file *file)
461{ 248{
462 return seq_open_net(inode, file, &rt_cache_seq_ops, 249 return seq_open(file, &rt_cache_seq_ops);
463 sizeof(struct rt_cache_iter_state));
464} 250}
465 251
466static const struct file_operations rt_cache_seq_fops = { 252static const struct file_operations rt_cache_seq_fops = {
@@ -468,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {
468 .open = rt_cache_seq_open, 254 .open = rt_cache_seq_open,
469 .read = seq_read, 255 .read = seq_read,
470 .llseek = seq_lseek, 256 .llseek = seq_lseek,
471 .release = seq_release_net, 257 .release = seq_release,
472}; 258};
473 259
474 260
@@ -658,275 +444,12 @@ static inline int ip_rt_proc_init(void)
658} 444}
659#endif /* CONFIG_PROC_FS */ 445#endif /* CONFIG_PROC_FS */
660 446
661static inline void rt_free(struct rtable *rt)
662{
663 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664}
665
666static inline void rt_drop(struct rtable *rt)
667{
668 ip_rt_put(rt);
669 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670}
671
672static inline int rt_fast_clean(struct rtable *rth)
673{
674 /* Kill broadcast/multicast entries very aggresively, if they
675 collide in hash table with more useful entries */
676 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677 rt_is_input_route(rth) && rth->dst.rt_next;
678}
679
680static inline int rt_valuable(struct rtable *rth)
681{
682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683 (rth->peer && rth->peer->pmtu_expires);
684}
685
686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687{
688 unsigned long age;
689 int ret = 0;
690
691 if (atomic_read(&rth->dst.__refcnt))
692 goto out;
693
694 age = jiffies - rth->dst.lastuse;
695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699out: return ret;
700}
701
702/* Bits of score are:
703 * 31: very valuable
704 * 30: not quite useless
705 * 29..0: usage counter
706 */
707static inline u32 rt_score(struct rtable *rt)
708{
709 u32 score = jiffies - rt->dst.lastuse;
710
711 score = ~score & ~(3<<30);
712
713 if (rt_valuable(rt))
714 score |= (1<<31);
715
716 if (rt_is_output_route(rt) ||
717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
721}
722
723static inline bool rt_caching(const struct net *net)
724{
725 return net->ipv4.current_rt_cache_rebuild_count <=
726 net->ipv4.sysctl_rt_cache_rebuild_count;
727}
728
729static inline bool compare_hash_inputs(const struct rtable *rt1,
730 const struct rtable *rt2)
731{
732 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735}
736
737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738{
739 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 (rt1->rt_mark ^ rt2->rt_mark) |
742 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745}
746
747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748{
749 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750}
751
752static inline int rt_is_expired(struct rtable *rth) 447static inline int rt_is_expired(struct rtable *rth)
753{ 448{
754 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 449 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755} 450}
756 451
757/* 452/*
758 * Perform a full scan of hash table and free all entries.
759 * Can be called by a softirq or a process.
760 * In the later case, we want to be reschedule if necessary
761 */
762static void rt_do_flush(struct net *net, int process_context)
763{
764 unsigned int i;
765 struct rtable *rth, *next;
766
767 for (i = 0; i <= rt_hash_mask; i++) {
768 struct rtable __rcu **pprev;
769 struct rtable *list;
770
771 if (process_context && need_resched())
772 cond_resched();
773 rth = rcu_access_pointer(rt_hash_table[i].chain);
774 if (!rth)
775 continue;
776
777 spin_lock_bh(rt_hash_lock_addr(i));
778
779 list = NULL;
780 pprev = &rt_hash_table[i].chain;
781 rth = rcu_dereference_protected(*pprev,
782 lockdep_is_held(rt_hash_lock_addr(i)));
783
784 while (rth) {
785 next = rcu_dereference_protected(rth->dst.rt_next,
786 lockdep_is_held(rt_hash_lock_addr(i)));
787
788 if (!net ||
789 net_eq(dev_net(rth->dst.dev), net)) {
790 rcu_assign_pointer(*pprev, next);
791 rcu_assign_pointer(rth->dst.rt_next, list);
792 list = rth;
793 } else {
794 pprev = &rth->dst.rt_next;
795 }
796 rth = next;
797 }
798
799 spin_unlock_bh(rt_hash_lock_addr(i));
800
801 for (; list; list = next) {
802 next = rcu_dereference_protected(list->dst.rt_next, 1);
803 rt_free(list);
804 }
805 }
806}
807
808/*
809 * While freeing expired entries, we compute average chain length
810 * and standard deviation, using fixed-point arithmetic.
811 * This to have an estimation of rt_chain_length_max
812 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
813 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
814 */
815
816#define FRACT_BITS 3
817#define ONE (1UL << FRACT_BITS)
818
819/*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826static int has_noalias(const struct rtable *head, const struct rtable *rth)
827{
828 const struct rtable *aux = head;
829
830 while (aux != rth) {
831 if (compare_hash_inputs(aux, rth))
832 return 0;
833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834 }
835 return ONE;
836}
837
838static void rt_check_expire(void)
839{
840 static unsigned int rover;
841 unsigned int i = rover, goal;
842 struct rtable *rth;
843 struct rtable __rcu **rthp;
844 unsigned long samples = 0;
845 unsigned long sum = 0, sum2 = 0;
846 unsigned long delta;
847 u64 mult;
848
849 delta = jiffies - expires_ljiffies;
850 expires_ljiffies = jiffies;
851 mult = ((u64)delta) << rt_hash_log;
852 if (ip_rt_gc_timeout > 1)
853 do_div(mult, ip_rt_gc_timeout);
854 goal = (unsigned int)mult;
855 if (goal > rt_hash_mask)
856 goal = rt_hash_mask + 1;
857 for (; goal > 0; goal--) {
858 unsigned long tmo = ip_rt_gc_timeout;
859 unsigned long length;
860
861 i = (i + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[i].chain;
863
864 if (need_resched())
865 cond_resched();
866
867 samples++;
868
869 if (rcu_dereference_raw(*rthp) == NULL)
870 continue;
871 length = 0;
872 spin_lock_bh(rt_hash_lock_addr(i));
873 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) {
877 *rthp = rth->dst.rt_next;
878 rt_free(rth);
879 continue;
880 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once
890 * so that entries for different QOS
891 * levels, and other non-hash input
892 * attributes don't unfairly skew
893 * the length computation
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900
901 /* Cleanup aged off entries. */
902 *rthp = rth->dst.rt_next;
903 rt_free(rth);
904 }
905 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length;
907 sum2 += length*length;
908 }
909 if (samples) {
910 unsigned long avg = sum / samples;
911 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 rt_chain_length_max = max_t(unsigned long,
913 ip_rt_gc_elasticity,
914 (avg + 4*sd) >> FRACT_BITS);
915 }
916 rover = i;
917}
918
919/*
920 * rt_worker_func() is run in process context.
921 * we call rt_check_expire() to scan part of the hash table
922 */
923static void rt_worker_func(struct work_struct *work)
924{
925 rt_check_expire();
926 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927}
928
929/*
930 * Perturbation of rt_genid by a small quantity [1..256] 453 * Perturbation of rt_genid by a small quantity [1..256]
931 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 454 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
932 * many times (2^24) without giving recent rt_genid. 455 * many times (2^24) without giving recent rt_genid.
@@ -938,7 +461,6 @@ static void rt_cache_invalidate(struct net *net)
938 461
939 get_random_bytes(&shuffle, sizeof(shuffle)); 462 get_random_bytes(&shuffle, sizeof(shuffle));
940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 463 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 inetpeer_invalidate_tree(AF_INET);
942} 464}
943 465
944/* 466/*
@@ -948,183 +470,22 @@ static void rt_cache_invalidate(struct net *net)
948void rt_cache_flush(struct net *net, int delay) 470void rt_cache_flush(struct net *net, int delay)
949{ 471{
950 rt_cache_invalidate(net); 472 rt_cache_invalidate(net);
951 if (delay >= 0)
952 rt_do_flush(net, !in_softirq());
953}
954
955/* Flush previous cache invalidated entries from the cache */
956void rt_cache_flush_batch(struct net *net)
957{
958 rt_do_flush(net, !in_softirq());
959}
960
961static void rt_emergency_hash_rebuild(struct net *net)
962{
963 net_warn_ratelimited("Route hash chain too long!\n");
964 rt_cache_invalidate(net);
965}
966
967/*
968 Short description of GC goals.
969
970 We want to build algorithm, which will keep routing cache
971 at some equilibrium point, when number of aged off entries
972 is kept approximately equal to newly generated ones.
973
974 Current expiration strength is variable "expire".
975 We try to adjust it dynamically, so that if networking
976 is idle expires is large enough to keep enough of warm entries,
977 and when load increases it reduces to limit cache size.
978 */
979
980static int rt_garbage_collect(struct dst_ops *ops)
981{
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
984 static int rover;
985 static int equilibrium;
986 struct rtable *rth;
987 struct rtable __rcu **rthp;
988 unsigned long now = jiffies;
989 int goal;
990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992 /*
993 * Garbage collection is pretty expensive,
994 * do not make it too frequently.
995 */
996
997 RT_CACHE_STAT_INC(gc_total);
998
999 if (now - last_gc < ip_rt_gc_min_interval &&
1000 entries < ip_rt_max_size) {
1001 RT_CACHE_STAT_INC(gc_ignored);
1002 goto out;
1003 }
1004
1005 entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 /* Calculate number of entries, which we want to expire now. */
1007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008 if (goal <= 0) {
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
1011 goal = entries - equilibrium;
1012 if (goal > 0) {
1013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 goal = entries - equilibrium;
1015 }
1016 } else {
1017 /* We are in dangerous area. Try to reduce cache really
1018 * aggressively.
1019 */
1020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 equilibrium = entries - goal;
1022 }
1023
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1025 last_gc = now;
1026
1027 if (goal <= 0) {
1028 equilibrium += goal;
1029 goto work_done;
1030 }
1031
1032 do {
1033 int i, k;
1034
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1037
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
1040 spin_lock_bh(rt_hash_lock_addr(k));
1041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 if (!rt_is_expired(rth) &&
1044 !rt_may_expire(rth, tmo, expire)) {
1045 tmo >>= 1;
1046 rthp = &rth->dst.rt_next;
1047 continue;
1048 }
1049 *rthp = rth->dst.rt_next;
1050 rt_free(rth);
1051 goal--;
1052 }
1053 spin_unlock_bh(rt_hash_lock_addr(k));
1054 if (goal <= 0)
1055 break;
1056 }
1057 rover = k;
1058
1059 if (goal <= 0)
1060 goto work_done;
1061
1062 /* Goal is not achieved. We stop process if:
1063
1064 - if expire reduced to zero. Otherwise, expire is halfed.
1065 - if table is not full.
1066 - if we are called from interrupt.
1067 - jiffies check is just fallback/debug loop breaker.
1068 We will not spin here for long time in any case.
1069 */
1070
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073 if (expire == 0)
1074 break;
1075
1076 expire >>= 1;
1077
1078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079 goto out;
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 goto out;
1086 net_warn_ratelimited("dst cache overflow\n");
1087 RT_CACHE_STAT_INC(gc_dst_overflow);
1088 return 1;
1089
1090work_done:
1091 expire += ip_rt_gc_min_interval;
1092 if (expire > ip_rt_gc_timeout ||
1093 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095 expire = ip_rt_gc_timeout;
1096out: return 0;
1097}
1098
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
1109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110 }
1111 return length >> FRACT_BITS;
1112} 473}
1113 474
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) 475static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
476 struct sk_buff *skb,
477 const void *daddr)
1115{ 478{
1116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev; 479 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr; 480 const __be32 *pkey = daddr;
1119 const struct rtable *rt; 481 const struct rtable *rt;
1120 struct neighbour *n; 482 struct neighbour *n;
1121 483
1122 rt = (const struct rtable *) dst; 484 rt = (const struct rtable *) dst;
1123 485 if (rt->rt_gateway)
1124 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125 pkey = &inaddr_any;
1126 else if (rt->rt_gateway)
1127 pkey = (const __be32 *) &rt->rt_gateway; 486 pkey = (const __be32 *) &rt->rt_gateway;
487 else if (skb)
488 pkey = &ip_hdr(skb)->daddr;
1128 489
1129 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); 490 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130 if (n) 491 if (n)
@@ -1132,212 +493,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const vo
1132 return neigh_create(&arp_tbl, pkey, dev); 493 return neigh_create(&arp_tbl, pkey, dev);
1133} 494}
1134 495
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138 if (IS_ERR(n))
1139 return PTR_ERR(n);
1140 dst_set_neighbour(&rt->dst, n);
1141
1142 return 0;
1143}
1144
1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146 struct sk_buff *skb, int ifindex)
1147{
1148 struct rtable *rth, *cand;
1149 struct rtable __rcu **rthp, **candp;
1150 unsigned long now;
1151 u32 min_score;
1152 int chain_length;
1153 int attempts = !in_softirq();
1154
1155restart:
1156 chain_length = 0;
1157 min_score = ~(u32)0;
1158 cand = NULL;
1159 candp = NULL;
1160 now = jiffies;
1161
1162 if (!rt_caching(dev_net(rt->dst.dev))) {
1163 /*
1164 * If we're not caching, just tell the caller we
1165 * were successful and don't touch the route. The
1166 * caller hold the sole reference to the cache entry, and
1167 * it will be released when the caller is done with it.
1168 * If we drop it here, the callers have no way to resolve routes
1169 * when we're not caching. Instead, just point *rp at rt, so
1170 * the caller gets a single use out of the route
1171 * Note that we do rt_free on this new route entry, so that
1172 * once its refcount hits zero, we are still able to reap it
1173 * (Thanks Alexey)
1174 * Note: To avoid expensive rcu stuff for this uncached dst,
1175 * we set DST_NOCACHE so that dst_release() can free dst without
1176 * waiting a grace period.
1177 */
1178
1179 rt->dst.flags |= DST_NOCACHE;
1180 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181 int err = rt_bind_neighbour(rt);
1182 if (err) {
1183 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184 ip_rt_put(rt);
1185 return ERR_PTR(err);
1186 }
1187 }
1188
1189 goto skip_hashing;
1190 }
1191
1192 rthp = &rt_hash_table[hash].chain;
1193
1194 spin_lock_bh(rt_hash_lock_addr(hash));
1195 while ((rth = rcu_dereference_protected(*rthp,
1196 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1197 if (rt_is_expired(rth)) {
1198 *rthp = rth->dst.rt_next;
1199 rt_free(rth);
1200 continue;
1201 }
1202 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203 /* Put it first */
1204 *rthp = rth->dst.rt_next;
1205 /*
1206 * Since lookup is lockfree, the deletion
1207 * must be visible to another weakly ordered CPU before
1208 * the insertion at the start of the hash chain.
1209 */
1210 rcu_assign_pointer(rth->dst.rt_next,
1211 rt_hash_table[hash].chain);
1212 /*
1213 * Since lookup is lockfree, the update writes
1214 * must be ordered for consistency on SMP.
1215 */
1216 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218 dst_use(&rth->dst, now);
1219 spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221 rt_drop(rt);
1222 if (skb)
1223 skb_dst_set(skb, &rth->dst);
1224 return rth;
1225 }
1226
1227 if (!atomic_read(&rth->dst.__refcnt)) {
1228 u32 score = rt_score(rth);
1229
1230 if (score <= min_score) {
1231 cand = rth;
1232 candp = rthp;
1233 min_score = score;
1234 }
1235 }
1236
1237 chain_length++;
1238
1239 rthp = &rth->dst.rt_next;
1240 }
1241
1242 if (cand) {
1243 /* ip_rt_gc_elasticity used to be average length of chain
1244 * length, when exceeded gc becomes really aggressive.
1245 *
1246 * The second limit is less certain. At the moment it allows
1247 * only 2 entries per bucket. We will see.
1248 */
1249 if (chain_length > ip_rt_gc_elasticity) {
1250 *candp = cand->dst.rt_next;
1251 rt_free(cand);
1252 }
1253 } else {
1254 if (chain_length > rt_chain_length_max &&
1255 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256 struct net *net = dev_net(rt->dst.dev);
1257 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258 if (!rt_caching(net)) {
1259 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260 rt->dst.dev->name, num);
1261 }
1262 rt_emergency_hash_rebuild(net);
1263 spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266 ifindex, rt_genid(net));
1267 goto restart;
1268 }
1269 }
1270
1271 /* Try to bind route to arp only if it is output
1272 route or unicast forwarding path.
1273 */
1274 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275 int err = rt_bind_neighbour(rt);
1276 if (err) {
1277 spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279 if (err != -ENOBUFS) {
1280 rt_drop(rt);
1281 return ERR_PTR(err);
1282 }
1283
1284 /* Neighbour tables are full and nothing
1285 can be released. Try to shrink route cache,
1286 it is most likely it holds some neighbour records.
1287 */
1288 if (attempts-- > 0) {
1289 int saved_elasticity = ip_rt_gc_elasticity;
1290 int saved_int = ip_rt_gc_min_interval;
1291 ip_rt_gc_elasticity = 1;
1292 ip_rt_gc_min_interval = 0;
1293 rt_garbage_collect(&ipv4_dst_ops);
1294 ip_rt_gc_min_interval = saved_int;
1295 ip_rt_gc_elasticity = saved_elasticity;
1296 goto restart;
1297 }
1298
1299 net_warn_ratelimited("Neighbour table overflow\n");
1300 rt_drop(rt);
1301 return ERR_PTR(-ENOBUFS);
1302 }
1303 }
1304
1305 rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307 /*
1308 * Since lookup is lockfree, we must make sure
1309 * previous writes to rt are committed to memory
1310 * before making rt visible to other CPUS.
1311 */
1312 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314 spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316skip_hashing:
1317 if (skb)
1318 skb_dst_set(skb, &rt->dst);
1319 return rt;
1320}
1321
1322static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324static u32 rt_peer_genid(void)
1325{
1326 return atomic_read(&__rt_peer_genid);
1327}
1328
1329void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330{
1331 struct inet_peer *peer;
1332
1333 peer = inet_getpeer_v4(daddr, create);
1334
1335 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336 inet_putpeer(peer);
1337 else
1338 rt->rt_peer_genid = rt_peer_genid();
1339}
1340
1341/* 496/*
1342 * Peer allocation may fail only in serious out-of-memory conditions. However 497 * Peer allocation may fail only in serious out-of-memory conditions. However
1343 * we still can generate some output. 498 * we still can generate some output.
@@ -1360,83 +515,188 @@ static void ip_select_fb_ident(struct iphdr *iph)
1360 515
1361void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 516void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362{ 517{
1363 struct rtable *rt = (struct rtable *) dst; 518 struct net *net = dev_net(dst->dev);
1364 519 struct inet_peer *peer;
1365 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366 if (rt->peer == NULL)
1367 rt_bind_peer(rt, rt->rt_dst, 1);
1368 520
1369 /* If peer is attached to destination, it is never detached, 521 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1370 so that we need not to grab a lock to dereference it. 522 if (peer) {
1371 */ 523 iph->id = htons(inet_getid(peer, more));
1372 if (rt->peer) { 524 inet_putpeer(peer);
1373 iph->id = htons(inet_getid(rt->peer, more)); 525 return;
1374 return; 526 }
1375 }
1376 } else if (!rt)
1377 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378 527
1379 ip_select_fb_ident(iph); 528 ip_select_fb_ident(iph);
1380} 529}
1381EXPORT_SYMBOL(__ip_select_ident); 530EXPORT_SYMBOL(__ip_select_ident);
1382 531
1383static void rt_del(unsigned int hash, struct rtable *rt) 532static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
533 const struct iphdr *iph,
534 int oif, u8 tos,
535 u8 prot, u32 mark, int flow_flags)
1384{ 536{
1385 struct rtable __rcu **rthp; 537 if (sk) {
1386 struct rtable *aux; 538 const struct inet_sock *inet = inet_sk(sk);
1387 539
1388 rthp = &rt_hash_table[hash].chain; 540 oif = sk->sk_bound_dev_if;
1389 spin_lock_bh(rt_hash_lock_addr(hash)); 541 mark = sk->sk_mark;
1390 ip_rt_put(rt); 542 tos = RT_CONN_FLAGS(sk);
1391 while ((aux = rcu_dereference_protected(*rthp, 543 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1392 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393 if (aux == rt || rt_is_expired(aux)) {
1394 *rthp = aux->dst.rt_next;
1395 rt_free(aux);
1396 continue;
1397 }
1398 rthp = &aux->dst.rt_next;
1399 } 544 }
1400 spin_unlock_bh(rt_hash_lock_addr(hash)); 545 flowi4_init_output(fl4, oif, mark, tos,
546 RT_SCOPE_UNIVERSE, prot,
547 flow_flags,
548 iph->daddr, iph->saddr, 0, 0);
1401} 549}
1402 550
1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) 551static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
552 const struct sock *sk)
1404{ 553{
1405 struct rtable *rt = (struct rtable *) dst; 554 const struct iphdr *iph = ip_hdr(skb);
1406 __be32 orig_gw = rt->rt_gateway; 555 int oif = skb->dev->ifindex;
1407 struct neighbour *n, *old_n; 556 u8 tos = RT_TOS(iph->tos);
557 u8 prot = iph->protocol;
558 u32 mark = skb->mark;
1408 559
1409 dst_confirm(&rt->dst); 560 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
561}
1410 562
1411 rt->rt_gateway = peer->redirect_learned.a4; 563static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
564{
565 const struct inet_sock *inet = inet_sk(sk);
566 const struct ip_options_rcu *inet_opt;
567 __be32 daddr = inet->inet_daddr;
1412 568
1413 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway); 569 rcu_read_lock();
1414 if (IS_ERR(n)) { 570 inet_opt = rcu_dereference(inet->inet_opt);
1415 rt->rt_gateway = orig_gw; 571 if (inet_opt && inet_opt->opt.srr)
1416 return; 572 daddr = inet_opt->opt.faddr;
573 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
574 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
575 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
576 inet_sk_flowi_flags(sk),
577 daddr, inet->inet_saddr, 0, 0);
578 rcu_read_unlock();
579}
580
581static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
582 const struct sk_buff *skb)
583{
584 if (skb)
585 build_skb_flow_key(fl4, skb, sk);
586 else
587 build_sk_flow_key(fl4, sk);
588}
589
590static DEFINE_SEQLOCK(fnhe_seqlock);
591
592static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593{
594 struct fib_nh_exception *fnhe, *oldest;
595
596 oldest = rcu_dereference(hash->chain);
597 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
598 fnhe = rcu_dereference(fnhe->fnhe_next)) {
599 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 oldest = fnhe;
1417 } 601 }
1418 old_n = xchg(&rt->dst._neighbour, n); 602 return oldest;
1419 if (old_n) 603}
1420 neigh_release(old_n); 604
1421 if (!(n->nud_state & NUD_VALID)) { 605static inline u32 fnhe_hashfun(__be32 daddr)
1422 neigh_event_send(n, NULL); 606{
607 u32 hval;
608
609 hval = (__force u32) daddr;
610 hval ^= (hval >> 11) ^ (hval >> 22);
611
612 return hval & (FNHE_HASH_SIZE - 1);
613}
614
615static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
616 u32 pmtu, unsigned long expires)
617{
618 struct fnhe_hash_bucket *hash;
619 struct fib_nh_exception *fnhe;
620 int depth;
621 u32 hval = fnhe_hashfun(daddr);
622
623 write_seqlock_bh(&fnhe_seqlock);
624
625 hash = nh->nh_exceptions;
626 if (!hash) {
627 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
628 if (!hash)
629 goto out_unlock;
630 nh->nh_exceptions = hash;
631 }
632
633 hash += hval;
634
635 depth = 0;
636 for (fnhe = rcu_dereference(hash->chain); fnhe;
637 fnhe = rcu_dereference(fnhe->fnhe_next)) {
638 if (fnhe->fnhe_daddr == daddr)
639 break;
640 depth++;
641 }
642
643 if (fnhe) {
644 if (gw)
645 fnhe->fnhe_gw = gw;
646 if (pmtu) {
647 fnhe->fnhe_pmtu = pmtu;
648 fnhe->fnhe_expires = expires;
649 }
1423 } else { 650 } else {
1424 rt->rt_flags |= RTCF_REDIRECTED; 651 if (depth > FNHE_RECLAIM_DEPTH)
1425 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 652 fnhe = fnhe_oldest(hash);
653 else {
654 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
655 if (!fnhe)
656 goto out_unlock;
657
658 fnhe->fnhe_next = hash->chain;
659 rcu_assign_pointer(hash->chain, fnhe);
660 }
661 fnhe->fnhe_daddr = daddr;
662 fnhe->fnhe_gw = gw;
663 fnhe->fnhe_pmtu = pmtu;
664 fnhe->fnhe_expires = expires;
1426 } 665 }
666
667 fnhe->fnhe_stamp = jiffies;
668
669out_unlock:
670 write_sequnlock_bh(&fnhe_seqlock);
671 return;
1427} 672}
1428 673
1429/* called in rcu_read_lock() section */ 674static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 675 bool kill_route)
1431 __be32 saddr, struct net_device *dev)
1432{ 676{
1433 int s, i; 677 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1434 struct in_device *in_dev = __in_dev_get_rcu(dev); 678 __be32 old_gw = ip_hdr(skb)->saddr;
1435 __be32 skeys[2] = { saddr, 0 }; 679 struct net_device *dev = skb->dev;
1436 int ikeys[2] = { dev->ifindex, 0 }; 680 struct in_device *in_dev;
1437 struct inet_peer *peer; 681 struct fib_result res;
682 struct neighbour *n;
1438 struct net *net; 683 struct net *net;
1439 684
685 switch (icmp_hdr(skb)->code & 7) {
686 case ICMP_REDIR_NET:
687 case ICMP_REDIR_NETTOS:
688 case ICMP_REDIR_HOST:
689 case ICMP_REDIR_HOSTTOS:
690 break;
691
692 default:
693 return;
694 }
695
696 if (rt->rt_gateway != old_gw)
697 return;
698
699 in_dev = __in_dev_get_rcu(dev);
1440 if (!in_dev) 700 if (!in_dev)
1441 return; 701 return;
1442 702
@@ -1456,72 +716,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1456 goto reject_redirect; 716 goto reject_redirect;
1457 } 717 }
1458 718
1459 for (s = 0; s < 2; s++) { 719 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1460 for (i = 0; i < 2; i++) { 720 if (n) {
1461 unsigned int hash; 721 if (!(n->nud_state & NUD_VALID)) {
1462 struct rtable __rcu **rthp; 722 neigh_event_send(n, NULL);
1463 struct rtable *rt; 723 } else {
1464 724 if (fib_lookup(net, fl4, &res) == 0) {
1465 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); 725 struct fib_nh *nh = &FIB_RES_NH(res);
1466 726
1467 rthp = &rt_hash_table[hash].chain; 727 update_or_create_fnhe(nh, fl4->daddr, new_gw,
1468 728 0, 0);
1469 while ((rt = rcu_dereference(*rthp)) != NULL) {
1470 rthp = &rt->dst.rt_next;
1471
1472 if (rt->rt_key_dst != daddr ||
1473 rt->rt_key_src != skeys[s] ||
1474 rt->rt_oif != ikeys[i] ||
1475 rt_is_input_route(rt) ||
1476 rt_is_expired(rt) ||
1477 !net_eq(dev_net(rt->dst.dev), net) ||
1478 rt->dst.error ||
1479 rt->dst.dev != dev ||
1480 rt->rt_gateway != old_gw)
1481 continue;
1482
1483 if (!rt->peer)
1484 rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486 peer = rt->peer;
1487 if (peer) {
1488 if (peer->redirect_learned.a4 != new_gw) {
1489 peer->redirect_learned.a4 = new_gw;
1490 atomic_inc(&__rt_peer_genid);
1491 }
1492 check_peer_redir(&rt->dst, peer);
1493 }
1494 } 729 }
730 if (kill_route)
731 rt->dst.obsolete = DST_OBSOLETE_KILL;
732 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1495 } 733 }
734 neigh_release(n);
1496 } 735 }
1497 return; 736 return;
1498 737
1499reject_redirect: 738reject_redirect:
1500#ifdef CONFIG_IP_ROUTE_VERBOSE 739#ifdef CONFIG_IP_ROUTE_VERBOSE
1501 if (IN_DEV_LOG_MARTIANS(in_dev)) 740 if (IN_DEV_LOG_MARTIANS(in_dev)) {
741 const struct iphdr *iph = (const struct iphdr *) skb->data;
742 __be32 daddr = iph->daddr;
743 __be32 saddr = iph->saddr;
744
1502 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" 745 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1503 " Advised path = %pI4 -> %pI4\n", 746 " Advised path = %pI4 -> %pI4\n",
1504 &old_gw, dev->name, &new_gw, 747 &old_gw, dev->name, &new_gw,
1505 &saddr, &daddr); 748 &saddr, &daddr);
749 }
1506#endif 750#endif
1507 ; 751 ;
1508} 752}
1509 753
1510static bool peer_pmtu_expired(struct inet_peer *peer) 754static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1511{ 755{
1512 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 756 struct rtable *rt;
757 struct flowi4 fl4;
1513 758
1514 return orig && 759 rt = (struct rtable *) dst;
1515 time_after_eq(jiffies, orig) &&
1516 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517}
1518 760
1519static bool peer_pmtu_cleaned(struct inet_peer *peer) 761 ip_rt_build_flow_key(&fl4, sk, skb);
1520{ 762 __ip_do_redirect(rt, skb, &fl4, true);
1521 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523 return orig &&
1524 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525} 763}
1526 764
1527static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 765static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1533,14 +771,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533 if (dst->obsolete > 0) { 771 if (dst->obsolete > 0) {
1534 ip_rt_put(rt); 772 ip_rt_put(rt);
1535 ret = NULL; 773 ret = NULL;
1536 } else if (rt->rt_flags & RTCF_REDIRECTED) { 774 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1537 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 775 rt->dst.expires) {
1538 rt->rt_oif, 776 ip_rt_put(rt);
1539 rt_genid(dev_net(dst->dev)));
1540 rt_del(hash, rt);
1541 ret = NULL; 777 ret = NULL;
1542 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544 } 778 }
1545 } 779 }
1546 return ret; 780 return ret;
@@ -1567,6 +801,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1567 struct rtable *rt = skb_rtable(skb); 801 struct rtable *rt = skb_rtable(skb);
1568 struct in_device *in_dev; 802 struct in_device *in_dev;
1569 struct inet_peer *peer; 803 struct inet_peer *peer;
804 struct net *net;
1570 int log_martians; 805 int log_martians;
1571 806
1572 rcu_read_lock(); 807 rcu_read_lock();
@@ -1578,9 +813,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1578 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 813 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579 rcu_read_unlock(); 814 rcu_read_unlock();
1580 815
1581 if (!rt->peer) 816 net = dev_net(rt->dst.dev);
1582 rt_bind_peer(rt, rt->rt_dst, 1); 817 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1583 peer = rt->peer;
1584 if (!peer) { 818 if (!peer) {
1585 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 819 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586 return; 820 return;
@@ -1597,7 +831,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1597 */ 831 */
1598 if (peer->rate_tokens >= ip_rt_redirect_number) { 832 if (peer->rate_tokens >= ip_rt_redirect_number) {
1599 peer->rate_last = jiffies; 833 peer->rate_last = jiffies;
1600 return; 834 goto out_put_peer;
1601 } 835 }
1602 836
1603 /* Check for load limit; set rate_last to the latest sent 837 /* Check for load limit; set rate_last to the latest sent
@@ -1614,20 +848,38 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1614 if (log_martians && 848 if (log_martians &&
1615 peer->rate_tokens == ip_rt_redirect_number) 849 peer->rate_tokens == ip_rt_redirect_number)
1616 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 850 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1617 &ip_hdr(skb)->saddr, rt->rt_iif, 851 &ip_hdr(skb)->saddr, inet_iif(skb),
1618 &rt->rt_dst, &rt->rt_gateway); 852 &ip_hdr(skb)->daddr, &rt->rt_gateway);
1619#endif 853#endif
1620 } 854 }
855out_put_peer:
856 inet_putpeer(peer);
1621} 857}
1622 858
1623static int ip_error(struct sk_buff *skb) 859static int ip_error(struct sk_buff *skb)
1624{ 860{
861 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1625 struct rtable *rt = skb_rtable(skb); 862 struct rtable *rt = skb_rtable(skb);
1626 struct inet_peer *peer; 863 struct inet_peer *peer;
1627 unsigned long now; 864 unsigned long now;
865 struct net *net;
1628 bool send; 866 bool send;
1629 int code; 867 int code;
1630 868
869 net = dev_net(rt->dst.dev);
870 if (!IN_DEV_FORWARD(in_dev)) {
871 switch (rt->dst.error) {
872 case EHOSTUNREACH:
873 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
874 break;
875
876 case ENETUNREACH:
877 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
878 break;
879 }
880 goto out;
881 }
882
1631 switch (rt->dst.error) { 883 switch (rt->dst.error) {
1632 case EINVAL: 884 case EINVAL:
1633 default: 885 default:
@@ -1637,17 +889,14 @@ static int ip_error(struct sk_buff *skb)
1637 break; 889 break;
1638 case ENETUNREACH: 890 case ENETUNREACH:
1639 code = ICMP_NET_UNREACH; 891 code = ICMP_NET_UNREACH;
1640 IP_INC_STATS_BH(dev_net(rt->dst.dev), 892 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1641 IPSTATS_MIB_INNOROUTES);
1642 break; 893 break;
1643 case EACCES: 894 case EACCES:
1644 code = ICMP_PKT_FILTERED; 895 code = ICMP_PKT_FILTERED;
1645 break; 896 break;
1646 } 897 }
1647 898
1648 if (!rt->peer) 899 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1649 rt_bind_peer(rt, rt->rt_dst, 1);
1650 peer = rt->peer;
1651 900
1652 send = true; 901 send = true;
1653 if (peer) { 902 if (peer) {
@@ -1660,6 +909,7 @@ static int ip_error(struct sk_buff *skb)
1660 peer->rate_tokens -= ip_rt_error_cost; 909 peer->rate_tokens -= ip_rt_error_cost;
1661 else 910 else
1662 send = false; 911 send = false;
912 inet_putpeer(peer);
1663 } 913 }
1664 if (send) 914 if (send)
1665 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 915 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1668,163 +918,120 @@ out: kfree_skb(skb);
1668 return 0; 918 return 0;
1669} 919}
1670 920
1671/* 921static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1672 * The last two values are not from the RFC but 922{
1673 * are needed for AMPRnet AX.25 paths. 923 struct fib_result res;
1674 */
1675 924
1676static const unsigned short mtu_plateau[] = 925 if (mtu < ip_rt_min_pmtu)
1677{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 926 mtu = ip_rt_min_pmtu;
1678 927
1679static inline unsigned short guess_mtu(unsigned short old_mtu) 928 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1680{ 929 struct fib_nh *nh = &FIB_RES_NH(res);
1681 int i;
1682 930
1683 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) 931 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1684 if (old_mtu > mtu_plateau[i]) 932 jiffies + ip_rt_mtu_expires);
1685 return mtu_plateau[i]; 933 }
1686 return 68; 934 return mtu;
1687} 935}
1688 936
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, 937static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1690 unsigned short new_mtu, 938 struct sk_buff *skb, u32 mtu)
1691 struct net_device *dev)
1692{ 939{
1693 unsigned short old_mtu = ntohs(iph->tot_len); 940 struct rtable *rt = (struct rtable *) dst;
1694 unsigned short est_mtu = 0; 941 struct flowi4 fl4;
1695 struct inet_peer *peer;
1696
1697 peer = inet_getpeer_v4(iph->daddr, 1);
1698 if (peer) {
1699 unsigned short mtu = new_mtu;
1700
1701 if (new_mtu < 68 || new_mtu >= old_mtu) {
1702 /* BSD 4.2 derived systems incorrectly adjust
1703 * tot_len by the IP header length, and report
1704 * a zero MTU in the ICMP message.
1705 */
1706 if (mtu == 0 &&
1707 old_mtu >= 68 + (iph->ihl << 2))
1708 old_mtu -= iph->ihl << 2;
1709 mtu = guess_mtu(old_mtu);
1710 }
1711
1712 if (mtu < ip_rt_min_pmtu)
1713 mtu = ip_rt_min_pmtu;
1714 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715 unsigned long pmtu_expires;
1716
1717 pmtu_expires = jiffies + ip_rt_mtu_expires;
1718 if (!pmtu_expires)
1719 pmtu_expires = 1UL;
1720 942
1721 est_mtu = mtu; 943 ip_rt_build_flow_key(&fl4, sk, skb);
1722 peer->pmtu_learned = mtu; 944 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
1723 peer->pmtu_expires = pmtu_expires;
1724 atomic_inc(&__rt_peer_genid);
1725 }
1726 945
1727 inet_putpeer(peer); 946 if (!rt->rt_pmtu) {
947 dst->obsolete = DST_OBSOLETE_KILL;
948 } else {
949 rt->rt_pmtu = mtu;
950 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1728 } 951 }
1729 return est_mtu ? : new_mtu;
1730} 952}
1731 953
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) 954void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
955 int oif, u32 mark, u8 protocol, int flow_flags)
1733{ 956{
1734 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); 957 const struct iphdr *iph = (const struct iphdr *) skb->data;
958 struct flowi4 fl4;
959 struct rtable *rt;
1735 960
1736 if (!expires) 961 __build_flow_key(&fl4, NULL, iph, oif,
1737 return; 962 RT_TOS(iph->tos), protocol, mark, flow_flags);
1738 if (time_before(jiffies, expires)) { 963 rt = __ip_route_output_key(net, &fl4);
1739 u32 orig_dst_mtu = dst_mtu(dst); 964 if (!IS_ERR(rt)) {
1740 if (peer->pmtu_learned < orig_dst_mtu) { 965 __ip_rt_update_pmtu(rt, &fl4, mtu);
1741 if (!peer->pmtu_orig) 966 ip_rt_put(rt);
1742 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); 967 }
1743 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744 }
1745 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747} 968}
969EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1748 970
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 971void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1750{ 972{
1751 struct rtable *rt = (struct rtable *) dst; 973 const struct iphdr *iph = (const struct iphdr *) skb->data;
1752 struct inet_peer *peer; 974 struct flowi4 fl4;
1753 975 struct rtable *rt;
1754 dst_confirm(dst);
1755
1756 if (!rt->peer)
1757 rt_bind_peer(rt, rt->rt_dst, 1);
1758 peer = rt->peer;
1759 if (peer) {
1760 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762 if (mtu < ip_rt_min_pmtu)
1763 mtu = ip_rt_min_pmtu;
1764 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766 pmtu_expires = jiffies + ip_rt_mtu_expires;
1767 if (!pmtu_expires)
1768 pmtu_expires = 1UL;
1769
1770 peer->pmtu_learned = mtu;
1771 peer->pmtu_expires = pmtu_expires;
1772 976
1773 atomic_inc(&__rt_peer_genid); 977 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1774 rt->rt_peer_genid = rt_peer_genid(); 978 rt = __ip_route_output_key(sock_net(sk), &fl4);
1775 } 979 if (!IS_ERR(rt)) {
1776 check_peer_pmtu(dst, peer); 980 __ip_rt_update_pmtu(rt, &fl4, mtu);
981 ip_rt_put(rt);
1777 } 982 }
1778} 983}
984EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1779 985
1780 986void ipv4_redirect(struct sk_buff *skb, struct net *net,
1781static void ipv4_validate_peer(struct rtable *rt) 987 int oif, u32 mark, u8 protocol, int flow_flags)
1782{ 988{
1783 if (rt->rt_peer_genid != rt_peer_genid()) { 989 const struct iphdr *iph = (const struct iphdr *) skb->data;
1784 struct inet_peer *peer; 990 struct flowi4 fl4;
1785 991 struct rtable *rt;
1786 if (!rt->peer)
1787 rt_bind_peer(rt, rt->rt_dst, 0);
1788 992
1789 peer = rt->peer; 993 __build_flow_key(&fl4, NULL, iph, oif,
1790 if (peer) { 994 RT_TOS(iph->tos), protocol, mark, flow_flags);
1791 check_peer_pmtu(&rt->dst, peer); 995 rt = __ip_route_output_key(net, &fl4);
996 if (!IS_ERR(rt)) {
997 __ip_do_redirect(rt, skb, &fl4, false);
998 ip_rt_put(rt);
999 }
1000}
1001EXPORT_SYMBOL_GPL(ipv4_redirect);
1792 1002
1793 if (peer->redirect_learned.a4 && 1003void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1794 peer->redirect_learned.a4 != rt->rt_gateway) 1004{
1795 check_peer_redir(&rt->dst, peer); 1005 const struct iphdr *iph = (const struct iphdr *) skb->data;
1796 } 1006 struct flowi4 fl4;
1007 struct rtable *rt;
1797 1008
1798 rt->rt_peer_genid = rt_peer_genid(); 1009 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1010 rt = __ip_route_output_key(sock_net(sk), &fl4);
1011 if (!IS_ERR(rt)) {
1012 __ip_do_redirect(rt, skb, &fl4, false);
1013 ip_rt_put(rt);
1799 } 1014 }
1800} 1015}
1016EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1801 1017
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1018static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{ 1019{
1804 struct rtable *rt = (struct rtable *) dst; 1020 struct rtable *rt = (struct rtable *) dst;
1805 1021
1806 if (rt_is_expired(rt)) 1022 /* All IPV4 dsts are created with ->obsolete set to the value
1023 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1024 * into this function always.
1025 *
1026 * When a PMTU/redirect information update invalidates a
1027 * route, this is indicated by setting obsolete to
1028 * DST_OBSOLETE_KILL.
1029 */
1030 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1807 return NULL; 1031 return NULL;
1808 ipv4_validate_peer(rt);
1809 return dst; 1032 return dst;
1810} 1033}
1811 1034
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814 struct rtable *rt = (struct rtable *) dst;
1815 struct inet_peer *peer = rt->peer;
1816
1817 if (rt->fi) {
1818 fib_info_put(rt->fi);
1819 rt->fi = NULL;
1820 }
1821 if (peer) {
1822 rt->peer = NULL;
1823 inet_putpeer(peer);
1824 }
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb) 1035static void ipv4_link_failure(struct sk_buff *skb)
1829{ 1036{
1830 struct rtable *rt; 1037 struct rtable *rt;
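
ipv4_update_pmtu(), ipv4_sk_update_pmtu(), ipv4_redirect() and ipv4_sk_redirect() in the hunk above are the new exported entry points for feeding ICMP-learned PMTU and redirect information into nexthop exceptions. A hedged usage sketch follows; the handler name is hypothetical, and it assumes skb->data points at the offending inner IP header, which is what the helpers themselves expect:

	/* Hypothetical protocol error handler (illustrative, not from this
	 * patch).  On a fragmentation-needed report it hands the advertised
	 * MTU to ipv4_update_pmtu(), which rebuilds the flow from the inner
	 * IP header at skb->data and records a fib_nh_exception. */
	static void example_err_handler(struct sk_buff *skb, struct net *net,
					u32 info, u8 protocol)
	{
		const int type = icmp_hdr(skb)->type;
		const int code = icmp_hdr(skb)->code;

		if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
			/* info carries the next-hop MTU reported by the router */
			ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
		else if (type == ICMP_REDIRECT)
			ipv4_redirect(skb, net, 0, 0, protocol, 0);
	}
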
@@ -1832,8 +1039,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1832 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1039 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833 1040
1834 rt = skb_rtable(skb); 1041 rt = skb_rtable(skb);
1835 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) 1042 if (rt)
1836 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); 1043 dst_set_expires(&rt->dst, 0);
1837} 1044}
1838 1045
1839static int ip_rt_bug(struct sk_buff *skb) 1046static int ip_rt_bug(struct sk_buff *skb)
@@ -1880,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1880 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1087 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1088 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882 else 1089 else
1883 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1090 src = inet_select_addr(rt->dst.dev,
1884 RT_SCOPE_UNIVERSE); 1091 rt_nexthop(rt, iph->daddr),
1092 RT_SCOPE_UNIVERSE);
1885 rcu_read_unlock(); 1093 rcu_read_unlock();
1886 } 1094 }
1887 memcpy(addr, &src, 4); 1095 memcpy(addr, &src, 4);
@@ -1913,7 +1121,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1913static unsigned int ipv4_mtu(const struct dst_entry *dst) 1121static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{ 1122{
1915 const struct rtable *rt = (const struct rtable *) dst; 1123 const struct rtable *rt = (const struct rtable *) dst;
1916 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 1124 unsigned int mtu = rt->rt_pmtu;
1125
1126 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1127 mtu = 0;
1128
1129 if (!mtu)
1130 mtu = dst_metric_raw(dst, RTAX_MTU);
1917 1131
1918 if (mtu && rt_is_output_route(rt)) 1132 if (mtu && rt_is_output_route(rt))
1919 return mtu; 1133 return mtu;
@@ -1921,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1921 mtu = dst->dev->mtu; 1135 mtu = dst->dev->mtu;
1922 1136
1923 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1137 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924 1138 if (rt->rt_gateway && mtu > 576)
1925 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 mtu = 576; 1139 mtu = 576;
1927 } 1140 }
1928 1141
@@ -1932,76 +1145,121 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1932 return mtu; 1145 return mtu;
1933} 1146}
1934 1147
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1148static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1936 struct fib_info *fi)
1937{ 1149{
1938 struct inet_peer *peer; 1150 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1939 int create = 0; 1151 struct fib_nh_exception *fnhe;
1152 u32 hval;
1940 1153
1941 /* If a peer entry exists for this destination, we must hook 1154 if (!hash)
1942 * it up in order to get at cached metrics. 1155 return NULL;
1943 */
1944 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 create = 1;
1946 1156
1947 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); 1157 hval = fnhe_hashfun(daddr);
1948 if (peer) { 1158
1949 rt->rt_peer_genid = rt_peer_genid(); 1159 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1950 if (inet_metrics_new(peer)) 1160 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1951 memcpy(peer->metrics, fi->fib_metrics, 1161 if (fnhe->fnhe_daddr == daddr)
1952 sizeof(u32) * RTAX_MAX); 1162 return fnhe;
1953 dst_init_metrics(&rt->dst, peer->metrics, false); 1163 }
1954 1164 return NULL;
1955 check_peer_pmtu(&rt->dst, peer); 1165}
1956 1166
1957 if (peer->redirect_learned.a4 && 1167static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1958 peer->redirect_learned.a4 != rt->rt_gateway) { 1168 __be32 daddr)
1959 rt->rt_gateway = peer->redirect_learned.a4; 1169{
1960 rt->rt_flags |= RTCF_REDIRECTED; 1170 __be32 fnhe_daddr, gw;
1961 } 1171 unsigned long expires;
1962 } else { 1172 unsigned int seq;
1963 if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1173 u32 pmtu;
1964 rt->fi = fi; 1174
1965 atomic_inc(&fi->fib_clntref); 1175restart:
1176 seq = read_seqbegin(&fnhe_seqlock);
1177 fnhe_daddr = fnhe->fnhe_daddr;
1178 gw = fnhe->fnhe_gw;
1179 pmtu = fnhe->fnhe_pmtu;
1180 expires = fnhe->fnhe_expires;
1181 if (read_seqretry(&fnhe_seqlock, seq))
1182 goto restart;
1183
1184 if (daddr != fnhe_daddr)
1185 return;
1186
1187 if (pmtu) {
1188 unsigned long diff = expires - jiffies;
1189
1190 if (time_before(jiffies, expires)) {
1191 rt->rt_pmtu = pmtu;
1192 dst_set_expires(&rt->dst, diff);
1966 } 1193 }
1967 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 } 1194 }
1195 if (gw) {
1196 rt->rt_flags |= RTCF_REDIRECTED;
1197 rt->rt_gateway = gw;
1198 }
1199 fnhe->fnhe_stamp = jiffies;
1200}
1201
1202static inline void rt_release_rcu(struct rcu_head *head)
1203{
1204 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1205 dst_release(dst);
1206}
1207
1208static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1209{
1210 struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1211
1212 if (rt_is_input_route(rt))
1213 p = &nh->nh_rth_input;
1214
1215 orig = *p;
1216
1217 prev = cmpxchg(p, orig, rt);
1218 if (prev == orig) {
1219 dst_clone(&rt->dst);
1220 if (orig)
1221 call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
1222 }
1223}
1224
1225static bool rt_cache_valid(struct rtable *rt)
1226{
1227 return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
1969} 1228}
1970 1229
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1230static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1972 const struct fib_result *res, 1231 const struct fib_result *res,
1232 struct fib_nh_exception *fnhe,
1973 struct fib_info *fi, u16 type, u32 itag) 1233 struct fib_info *fi, u16 type, u32 itag)
1974{ 1234{
1975 struct dst_entry *dst = &rt->dst;
1976
1977 if (fi) { 1235 if (fi) {
1978 if (FIB_RES_GW(*res) && 1236 struct fib_nh *nh = &FIB_RES_NH(*res);
1979 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1237
1980 rt->rt_gateway = FIB_RES_GW(*res); 1238 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1981 rt_init_metrics(rt, fl4, fi); 1239 rt->rt_gateway = nh->nh_gw;
1240 if (unlikely(fnhe))
1241 rt_bind_exception(rt, fnhe, daddr);
1242 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1982#ifdef CONFIG_IP_ROUTE_CLASSID 1243#ifdef CONFIG_IP_ROUTE_CLASSID
1983 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1244 rt->dst.tclassid = nh->nh_tclassid;
1984#endif 1245#endif
1246 if (!(rt->dst.flags & DST_HOST))
1247 rt_cache_route(nh, rt);
1985 } 1248 }
1986 1249
1987 if (dst_mtu(dst) > IP_MAX_MTU)
1988 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID 1250#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES 1251#ifdef CONFIG_IP_MULTIPLE_TABLES
1994 set_class_tag(rt, fib_rules_tclass(res)); 1252 set_class_tag(rt, res->tclassid);
1995#endif 1253#endif
1996 set_class_tag(rt, itag); 1254 set_class_tag(rt, itag);
1997#endif 1255#endif
1998} 1256}
1999 1257
2000static struct rtable *rt_dst_alloc(struct net_device *dev, 1258static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 bool nopolicy, bool noxfrm) 1259 bool nopolicy, bool noxfrm, bool will_cache)
2002{ 1260{
2003 return dst_alloc(&ipv4_dst_ops, dev, 1, -1, 1261 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
2004 DST_HOST | 1262 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
2005 (nopolicy ? DST_NOPOLICY : 0) | 1263 (nopolicy ? DST_NOPOLICY : 0) |
2006 (noxfrm ? DST_NOXFRM : 0)); 1264 (noxfrm ? DST_NOXFRM : 0));
2007} 1265}
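
find_exception() in the hunk above searches a small per-nexthop hash of fib_nh_exception entries keyed by destination; rt_bind_exception() then copies the learned PMTU and gateway into the route under a seqlock. A rough userspace sketch of just the bucket-and-chain lookup, assuming a simple multiplicative hash is close enough to fnhe_hashfun() for illustration (all names below are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define DEMO_HASH_SIZE 16

struct demo_exception {
	struct demo_exception *next;
	uint32_t daddr;
	uint32_t pmtu;
};

static struct demo_exception *table[DEMO_HASH_SIZE];

static unsigned int hash_daddr(uint32_t daddr)
{
	return (daddr * 2654435761u) >> 28;  /* cheap multiplicative hash, 0..15 */
}

static struct demo_exception *demo_find(uint32_t daddr)
{
	struct demo_exception *e;

	for (e = table[hash_daddr(daddr)]; e; e = e->next)
		if (e->daddr == daddr)
			return e;
	return NULL;
}

int main(void)
{
	struct demo_exception *e = calloc(1, sizeof(*e));

	e->daddr = 0x0a000001;
	e->pmtu  = 1400;
	e->next  = table[hash_daddr(e->daddr)];
	table[hash_daddr(e->daddr)] = e;     /* insert at the bucket head */

	printf("pmtu=%u\n", demo_find(0x0a000001)->pmtu);
	return 0;
}
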
@@ -2010,9 +1268,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1268static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011 u8 tos, struct net_device *dev, int our) 1269 u8 tos, struct net_device *dev, int our)
2012{ 1270{
2013 unsigned int hash;
2014 struct rtable *rth; 1271 struct rtable *rth;
2015 __be32 spec_dst;
2016 struct in_device *in_dev = __in_dev_get_rcu(dev); 1272 struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 u32 itag = 0; 1273 u32 itag = 0;
2018 int err; 1274 int err;
@@ -2023,21 +1279,24 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2023 return -EINVAL; 1279 return -EINVAL;
2024 1280
2025 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1281 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) 1282 skb->protocol != htons(ETH_P_IP))
2027 goto e_inval; 1283 goto e_inval;
2028 1284
1285 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1286 if (ipv4_is_loopback(saddr))
1287 goto e_inval;
1288
2029 if (ipv4_is_zeronet(saddr)) { 1289 if (ipv4_is_zeronet(saddr)) {
2030 if (!ipv4_is_local_multicast(daddr)) 1290 if (!ipv4_is_local_multicast(daddr))
2031 goto e_inval; 1291 goto e_inval;
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 } else { 1292 } else {
2034 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 1293 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2035 &itag); 1294 in_dev, &itag);
2036 if (err < 0) 1295 if (err < 0)
2037 goto e_err; 1296 goto e_err;
2038 } 1297 }
2039 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, 1298 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1299 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
2041 if (!rth) 1300 if (!rth)
2042 goto e_nobufs; 1301 goto e_nobufs;
2043 1302
@@ -2046,23 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046#endif 1305#endif
2047 rth->dst.output = ip_rt_bug; 1306 rth->dst.output = ip_rt_bug;
2048 1307
2049 rth->rt_key_dst = daddr;
2050 rth->rt_key_src = saddr;
2051 rth->rt_genid = rt_genid(dev_net(dev)); 1308 rth->rt_genid = rt_genid(dev_net(dev));
2052 rth->rt_flags = RTCF_MULTICAST; 1309 rth->rt_flags = RTCF_MULTICAST;
2053 rth->rt_type = RTN_MULTICAST; 1310 rth->rt_type = RTN_MULTICAST;
2054 rth->rt_key_tos = tos; 1311 rth->rt_is_input= 1;
2055 rth->rt_dst = daddr; 1312 rth->rt_iif = 0;
2056 rth->rt_src = saddr; 1313 rth->rt_pmtu = 0;
2057 rth->rt_route_iif = dev->ifindex; 1314 rth->rt_gateway = 0;
2058 rth->rt_iif = dev->ifindex;
2059 rth->rt_oif = 0;
2060 rth->rt_mark = skb->mark;
2061 rth->rt_gateway = daddr;
2062 rth->rt_spec_dst= spec_dst;
2063 rth->rt_peer_genid = 0;
2064 rth->peer = NULL;
2065 rth->fi = NULL;
2066 if (our) { 1315 if (our) {
2067 rth->dst.input= ip_local_deliver; 1316 rth->dst.input= ip_local_deliver;
2068 rth->rt_flags |= RTCF_LOCAL; 1317 rth->rt_flags |= RTCF_LOCAL;
@@ -2074,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2074#endif 1323#endif
2075 RT_CACHE_STAT_INC(in_slow_mc); 1324 RT_CACHE_STAT_INC(in_slow_mc);
2076 1325
2077 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1326 skb_dst_set(skb, &rth->dst);
2078 rth = rt_intern_hash(hash, rth, skb, dev->ifindex); 1327 return 0;
2079 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080 1328
2081e_nobufs: 1329e_nobufs:
2082 return -ENOBUFS; 1330 return -ENOBUFS;
@@ -2123,7 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb,
2123 int err; 1371 int err;
2124 struct in_device *out_dev; 1372 struct in_device *out_dev;
2125 unsigned int flags = 0; 1373 unsigned int flags = 0;
2126 __be32 spec_dst; 1374 bool do_cache;
2127 u32 itag; 1375 u32 itag;
2128 1376
2129 /* get a working reference to the output device */ 1377 /* get a working reference to the output device */
@@ -2135,7 +1383,7 @@ static int __mkroute_input(struct sk_buff *skb,
2135 1383
2136 1384
2137 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 1385 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138 in_dev->dev, &spec_dst, &itag); 1386 in_dev->dev, in_dev, &itag);
2139 if (err < 0) { 1387 if (err < 0) {
2140 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1388 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141 saddr); 1389 saddr);
@@ -2143,9 +1391,6 @@ static int __mkroute_input(struct sk_buff *skb,
2143 goto cleanup; 1391 goto cleanup;
2144 } 1392 }
2145 1393
2146 if (err)
2147 flags |= RTCF_DIRECTSRC;
2148
2149 if (out_dev == in_dev && err && 1394 if (out_dev == in_dev && err &&
2150 (IN_DEV_SHARED_MEDIA(out_dev) || 1395 (IN_DEV_SHARED_MEDIA(out_dev) ||
2151 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) 1396 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
@@ -2166,37 +1411,39 @@ static int __mkroute_input(struct sk_buff *skb,
2166 } 1411 }
2167 } 1412 }
2168 1413
1414 do_cache = false;
1415 if (res->fi) {
1416 if (!itag) {
1417 rth = FIB_RES_NH(*res).nh_rth_input;
1418 if (rt_cache_valid(rth)) {
1419 dst_hold(&rth->dst);
1420 goto out;
1421 }
1422 do_cache = true;
1423 }
1424 }
1425
2169 rth = rt_dst_alloc(out_dev->dev, 1426 rth = rt_dst_alloc(out_dev->dev,
2170 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1427 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171 IN_DEV_CONF_GET(out_dev, NOXFRM)); 1428 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
2172 if (!rth) { 1429 if (!rth) {
2173 err = -ENOBUFS; 1430 err = -ENOBUFS;
2174 goto cleanup; 1431 goto cleanup;
2175 } 1432 }
2176 1433
2177 rth->rt_key_dst = daddr;
2178 rth->rt_key_src = saddr;
2179 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 1434 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180 rth->rt_flags = flags; 1435 rth->rt_flags = flags;
2181 rth->rt_type = res->type; 1436 rth->rt_type = res->type;
2182 rth->rt_key_tos = tos; 1437 rth->rt_is_input = 1;
2183 rth->rt_dst = daddr; 1438 rth->rt_iif = 0;
2184 rth->rt_src = saddr; 1439 rth->rt_pmtu = 0;
2185 rth->rt_route_iif = in_dev->dev->ifindex; 1440 rth->rt_gateway = 0;
2186 rth->rt_iif = in_dev->dev->ifindex;
2187 rth->rt_oif = 0;
2188 rth->rt_mark = skb->mark;
2189 rth->rt_gateway = daddr;
2190 rth->rt_spec_dst= spec_dst;
2191 rth->rt_peer_genid = 0;
2192 rth->peer = NULL;
2193 rth->fi = NULL;
2194 1441
2195 rth->dst.input = ip_forward; 1442 rth->dst.input = ip_forward;
2196 rth->dst.output = ip_output; 1443 rth->dst.output = ip_output;
2197 1444
2198 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); 1445 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
2199 1446out:
2200 *result = rth; 1447 *result = rth;
2201 err = 0; 1448 err = 0;
2202 cleanup: 1449 cleanup:
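
In the hunk above __mkroute_input() first tries the per-nexthop nh_rth_input entry (rt_cache_valid() plus dst_hold()) and only allocates a new rtable when that misses; rt_set_nexthop() later publishes the new route through rt_cache_route()'s cmpxchg(). A simplified userspace sketch of that single-slot publish, with the RCU-deferred dst_release() reduced to a plain refcount decrement for the demo (names are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cached_route {
	int refcnt;            /* stands in for dst_hold()/dst_release() */
	unsigned int daddr;
};

static _Atomic(struct cached_route *) slot;  /* like nh->nh_rth_input */

static void cache_publish(struct cached_route *rt)
{
	struct cached_route *orig = atomic_load(&slot);

	/* Install only if nobody raced us; the loser keeps its own copy. */
	if (atomic_compare_exchange_strong(&slot, &orig, rt)) {
		rt->refcnt++;          /* the cache now holds a reference  */
		if (orig)
			orig->refcnt--;  /* kernel defers this via call_rcu_bh() */
	}
}

int main(void)
{
	struct cached_route *rt = calloc(1, sizeof(*rt)), *hit;

	rt->refcnt = 1;                /* caller's own reference */
	rt->daddr  = 0x0a000001;
	cache_publish(rt);

	hit = atomic_load(&slot);      /* later lookup: reuse the cached copy */
	if (hit) {
		hit->refcnt++;         /* counterpart of dst_hold() */
		printf("reused route to %#x, refcnt=%d\n", hit->daddr, hit->refcnt);
	}
	return 0;
}
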
@@ -2211,7 +1458,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
2211{ 1458{
2212 struct rtable *rth = NULL; 1459 struct rtable *rth = NULL;
2213 int err; 1460 int err;
2214 unsigned int hash;
2215 1461
2216#ifdef CONFIG_IP_ROUTE_MULTIPATH 1462#ifdef CONFIG_IP_ROUTE_MULTIPATH
2217 if (res->fi && res->fi->fib_nhs > 1) 1463 if (res->fi && res->fi->fib_nhs > 1)
@@ -2223,12 +1469,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2223 if (err) 1469 if (err)
2224 return err; 1470 return err;
2225 1471
2226 /* put it into the cache */ 1472 skb_dst_set(skb, &rth->dst);
2227 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228 rt_genid(dev_net(rth->dst.dev)));
2229 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230 if (IS_ERR(rth))
2231 return PTR_ERR(rth);
2232 return 0; 1473 return 0;
2233} 1474}
2234 1475
@@ -2252,10 +1493,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2252 unsigned int flags = 0; 1493 unsigned int flags = 0;
2253 u32 itag = 0; 1494 u32 itag = 0;
2254 struct rtable *rth; 1495 struct rtable *rth;
2255 unsigned int hash;
2256 __be32 spec_dst;
2257 int err = -EINVAL; 1496 int err = -EINVAL;
2258 struct net *net = dev_net(dev); 1497 struct net *net = dev_net(dev);
1498 bool do_cache;
2259 1499
2260 /* IP on this device is disabled. */ 1500 /* IP on this device is disabled. */
2261 1501
@@ -2266,10 +1506,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2266 by fib_lookup. 1506 by fib_lookup.
2267 */ 1507 */
2268 1508
2269 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1509 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2270 ipv4_is_loopback(saddr))
2271 goto martian_source; 1510 goto martian_source;
2272 1511
1512 res.fi = NULL;
2273 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 1513 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274 goto brd_input; 1514 goto brd_input;
2275 1515
@@ -2279,9 +1519,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2279 if (ipv4_is_zeronet(saddr)) 1519 if (ipv4_is_zeronet(saddr))
2280 goto martian_source; 1520 goto martian_source;
2281 1521
2282 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) 1522 if (ipv4_is_zeronet(daddr))
2283 goto martian_destination; 1523 goto martian_destination;
2284 1524
1525 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1526 if (ipv4_is_loopback(daddr))
1527 goto martian_destination;
1528
1529 if (ipv4_is_loopback(saddr))
1530 goto martian_source;
1531 }
1532
2285 /* 1533 /*
2286 * Now we are ready to route packet. 1534 * Now we are ready to route packet.
2287 */ 1535 */
@@ -2293,11 +1541,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 fl4.daddr = daddr; 1541 fl4.daddr = daddr;
2294 fl4.saddr = saddr; 1542 fl4.saddr = saddr;
2295 err = fib_lookup(net, &fl4, &res); 1543 err = fib_lookup(net, &fl4, &res);
2296 if (err != 0) { 1544 if (err != 0)
2297 if (!IN_DEV_FORWARD(in_dev))
2298 goto e_hostunreach;
2299 goto no_route; 1545 goto no_route;
2300 }
2301 1546
2302 RT_CACHE_STAT_INC(in_slow_tot); 1547 RT_CACHE_STAT_INC(in_slow_tot);
2303 1548
@@ -2307,17 +1552,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2307 if (res.type == RTN_LOCAL) { 1552 if (res.type == RTN_LOCAL) {
2308 err = fib_validate_source(skb, saddr, daddr, tos, 1553 err = fib_validate_source(skb, saddr, daddr, tos,
2309 net->loopback_dev->ifindex, 1554 net->loopback_dev->ifindex,
2310 dev, &spec_dst, &itag); 1555 dev, in_dev, &itag);
2311 if (err < 0) 1556 if (err < 0)
2312 goto martian_source_keep_err; 1557 goto martian_source_keep_err;
2313 if (err)
2314 flags |= RTCF_DIRECTSRC;
2315 spec_dst = daddr;
2316 goto local_input; 1558 goto local_input;
2317 } 1559 }
2318 1560
2319 if (!IN_DEV_FORWARD(in_dev)) 1561 if (!IN_DEV_FORWARD(in_dev))
2320 goto e_hostunreach; 1562 goto no_route;
2321 if (res.type != RTN_UNICAST) 1563 if (res.type != RTN_UNICAST)
2322 goto martian_destination; 1564 goto martian_destination;
2323 1565
@@ -2328,23 +1570,31 @@ brd_input:
2328 if (skb->protocol != htons(ETH_P_IP)) 1570 if (skb->protocol != htons(ETH_P_IP))
2329 goto e_inval; 1571 goto e_inval;
2330 1572
2331 if (ipv4_is_zeronet(saddr)) 1573 if (!ipv4_is_zeronet(saddr)) {
2332 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1574 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2333 else { 1575 in_dev, &itag);
2334 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335 &itag);
2336 if (err < 0) 1576 if (err < 0)
2337 goto martian_source_keep_err; 1577 goto martian_source_keep_err;
2338 if (err)
2339 flags |= RTCF_DIRECTSRC;
2340 } 1578 }
2341 flags |= RTCF_BROADCAST; 1579 flags |= RTCF_BROADCAST;
2342 res.type = RTN_BROADCAST; 1580 res.type = RTN_BROADCAST;
2343 RT_CACHE_STAT_INC(in_brd); 1581 RT_CACHE_STAT_INC(in_brd);
2344 1582
2345local_input: 1583local_input:
1584 do_cache = false;
1585 if (res.fi) {
1586 if (!itag) {
1587 rth = FIB_RES_NH(res).nh_rth_input;
1588 if (rt_cache_valid(rth)) {
1589 dst_hold(&rth->dst);
1590 goto set_and_out;
1591 }
1592 do_cache = true;
1593 }
1594 }
1595
2346 rth = rt_dst_alloc(net->loopback_dev, 1596 rth = rt_dst_alloc(net->loopback_dev,
2347 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1597 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2348 if (!rth) 1598 if (!rth)
2349 goto e_nobufs; 1599 goto e_nobufs;
2350 1600
@@ -2354,41 +1604,27 @@ local_input:
2354 rth->dst.tclassid = itag; 1604 rth->dst.tclassid = itag;
2355#endif 1605#endif
2356 1606
2357 rth->rt_key_dst = daddr;
2358 rth->rt_key_src = saddr;
2359 rth->rt_genid = rt_genid(net); 1607 rth->rt_genid = rt_genid(net);
2360 rth->rt_flags = flags|RTCF_LOCAL; 1608 rth->rt_flags = flags|RTCF_LOCAL;
2361 rth->rt_type = res.type; 1609 rth->rt_type = res.type;
2362 rth->rt_key_tos = tos; 1610 rth->rt_is_input = 1;
2363 rth->rt_dst = daddr; 1611 rth->rt_iif = 0;
2364 rth->rt_src = saddr; 1612 rth->rt_pmtu = 0;
2365#ifdef CONFIG_IP_ROUTE_CLASSID 1613 rth->rt_gateway = 0;
2366 rth->dst.tclassid = itag;
2367#endif
2368 rth->rt_route_iif = dev->ifindex;
2369 rth->rt_iif = dev->ifindex;
2370 rth->rt_oif = 0;
2371 rth->rt_mark = skb->mark;
2372 rth->rt_gateway = daddr;
2373 rth->rt_spec_dst= spec_dst;
2374 rth->rt_peer_genid = 0;
2375 rth->peer = NULL;
2376 rth->fi = NULL;
2377 if (res.type == RTN_UNREACHABLE) { 1614 if (res.type == RTN_UNREACHABLE) {
2378 rth->dst.input= ip_error; 1615 rth->dst.input= ip_error;
2379 rth->dst.error= -err; 1616 rth->dst.error= -err;
2380 rth->rt_flags &= ~RTCF_LOCAL; 1617 rth->rt_flags &= ~RTCF_LOCAL;
2381 } 1618 }
2382 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); 1619 if (do_cache)
2383 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); 1620 rt_cache_route(&FIB_RES_NH(res), rth);
1621set_and_out:
1622 skb_dst_set(skb, &rth->dst);
2384 err = 0; 1623 err = 0;
2385 if (IS_ERR(rth))
2386 err = PTR_ERR(rth);
2387 goto out; 1624 goto out;
2388 1625
2389no_route: 1626no_route:
2390 RT_CACHE_STAT_INC(in_no_route); 1627 RT_CACHE_STAT_INC(in_no_route);
2391 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392 res.type = RTN_UNREACHABLE; 1628 res.type = RTN_UNREACHABLE;
2393 if (err == -ESRCH) 1629 if (err == -ESRCH)
2394 err = -ENETUNREACH; 1630 err = -ENETUNREACH;
@@ -2405,10 +1641,6 @@ martian_destination:
2405 &daddr, &saddr, dev->name); 1641 &daddr, &saddr, dev->name);
2406#endif 1642#endif
2407 1643
2408e_hostunreach:
2409 err = -EHOSTUNREACH;
2410 goto out;
2411
2412e_inval: 1644e_inval:
2413 err = -EINVAL; 1645 err = -EINVAL;
2414 goto out; 1646 goto out;
@@ -2424,50 +1656,13 @@ martian_source_keep_err:
2424 goto out; 1656 goto out;
2425} 1657}
2426 1658
2427int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1659int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428 u8 tos, struct net_device *dev, bool noref) 1660 u8 tos, struct net_device *dev)
2429{ 1661{
2430 struct rtable *rth;
2431 unsigned int hash;
2432 int iif = dev->ifindex;
2433 struct net *net;
2434 int res; 1662 int res;
2435 1663
2436 net = dev_net(dev);
2437
2438 rcu_read_lock(); 1664 rcu_read_lock();
2439 1665
2440 if (!rt_caching(net))
2441 goto skip_cache;
2442
2443 tos &= IPTOS_RT_MASK;
2444 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445
2446 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447 rth = rcu_dereference(rth->dst.rt_next)) {
2448 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450 (rth->rt_route_iif ^ iif) |
2451 (rth->rt_key_tos ^ tos)) == 0 &&
2452 rth->rt_mark == skb->mark &&
2453 net_eq(dev_net(rth->dst.dev), net) &&
2454 !rt_is_expired(rth)) {
2455 ipv4_validate_peer(rth);
2456 if (noref) {
2457 dst_use_noref(&rth->dst, jiffies);
2458 skb_dst_set_noref(skb, &rth->dst);
2459 } else {
2460 dst_use(&rth->dst, jiffies);
2461 skb_dst_set(skb, &rth->dst);
2462 }
2463 RT_CACHE_STAT_INC(in_hit);
2464 rcu_read_unlock();
2465 return 0;
2466 }
2467 RT_CACHE_STAT_INC(in_hlist_search);
2468 }
2469
2470skip_cache:
2471 /* Multicast recognition logic is moved from route cache to here. 1666 /* Multicast recognition logic is moved from route cache to here.
2472 The problem was that too many Ethernet cards have broken/missing 1667 The problem was that too many Ethernet cards have broken/missing
2473 hardware multicast filters :-( As result the host on multicasting 1668 hardware multicast filters :-( As result the host on multicasting
@@ -2505,24 +1700,28 @@ skip_cache:
2505 rcu_read_unlock(); 1700 rcu_read_unlock();
2506 return res; 1701 return res;
2507} 1702}
2508EXPORT_SYMBOL(ip_route_input_common); 1703EXPORT_SYMBOL(ip_route_input);
2509 1704
2510/* called with rcu_read_lock() */ 1705/* called with rcu_read_lock() */
2511static struct rtable *__mkroute_output(const struct fib_result *res, 1706static struct rtable *__mkroute_output(const struct fib_result *res,
2512 const struct flowi4 *fl4, 1707 const struct flowi4 *fl4, int orig_oif,
2513 __be32 orig_daddr, __be32 orig_saddr,
2514 int orig_oif, __u8 orig_rtos,
2515 struct net_device *dev_out, 1708 struct net_device *dev_out,
2516 unsigned int flags) 1709 unsigned int flags)
2517{ 1710{
2518 struct fib_info *fi = res->fi; 1711 struct fib_info *fi = res->fi;
1712 struct fib_nh_exception *fnhe;
2519 struct in_device *in_dev; 1713 struct in_device *in_dev;
2520 u16 type = res->type; 1714 u16 type = res->type;
2521 struct rtable *rth; 1715 struct rtable *rth;
2522 1716
2523 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 1717 in_dev = __in_dev_get_rcu(dev_out);
1718 if (!in_dev)
2524 return ERR_PTR(-EINVAL); 1719 return ERR_PTR(-EINVAL);
2525 1720
1721 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1722 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1723 return ERR_PTR(-EINVAL);
1724
2526 if (ipv4_is_lbcast(fl4->daddr)) 1725 if (ipv4_is_lbcast(fl4->daddr))
2527 type = RTN_BROADCAST; 1726 type = RTN_BROADCAST;
2528 else if (ipv4_is_multicast(fl4->daddr)) 1727 else if (ipv4_is_multicast(fl4->daddr))
@@ -2533,10 +1732,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2533 if (dev_out->flags & IFF_LOOPBACK) 1732 if (dev_out->flags & IFF_LOOPBACK)
2534 flags |= RTCF_LOCAL; 1733 flags |= RTCF_LOCAL;
2535 1734
2536 in_dev = __in_dev_get_rcu(dev_out);
2537 if (!in_dev)
2538 return ERR_PTR(-EINVAL);
2539
2540 if (type == RTN_BROADCAST) { 1735 if (type == RTN_BROADCAST) {
2541 flags |= RTCF_BROADCAST | RTCF_LOCAL; 1736 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542 fi = NULL; 1737 fi = NULL;
@@ -2553,40 +1748,39 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2553 fi = NULL; 1748 fi = NULL;
2554 } 1749 }
2555 1750
1751 fnhe = NULL;
1752 if (fi) {
1753 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1754 if (!fnhe) {
1755 rth = FIB_RES_NH(*res).nh_rth_output;
1756 if (rt_cache_valid(rth)) {
1757 dst_hold(&rth->dst);
1758 return rth;
1759 }
1760 }
1761 }
2556 rth = rt_dst_alloc(dev_out, 1762 rth = rt_dst_alloc(dev_out,
2557 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1763 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558 IN_DEV_CONF_GET(in_dev, NOXFRM)); 1764 IN_DEV_CONF_GET(in_dev, NOXFRM),
1765 fi && !fnhe);
2559 if (!rth) 1766 if (!rth)
2560 return ERR_PTR(-ENOBUFS); 1767 return ERR_PTR(-ENOBUFS);
2561 1768
2562 rth->dst.output = ip_output; 1769 rth->dst.output = ip_output;
2563 1770
2564 rth->rt_key_dst = orig_daddr;
2565 rth->rt_key_src = orig_saddr;
2566 rth->rt_genid = rt_genid(dev_net(dev_out)); 1771 rth->rt_genid = rt_genid(dev_net(dev_out));
2567 rth->rt_flags = flags; 1772 rth->rt_flags = flags;
2568 rth->rt_type = type; 1773 rth->rt_type = type;
2569 rth->rt_key_tos = orig_rtos; 1774 rth->rt_is_input = 0;
2570 rth->rt_dst = fl4->daddr; 1775 rth->rt_iif = orig_oif ? : 0;
2571 rth->rt_src = fl4->saddr; 1776 rth->rt_pmtu = 0;
2572 rth->rt_route_iif = 0; 1777 rth->rt_gateway = 0;
2573 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2574 rth->rt_oif = orig_oif;
2575 rth->rt_mark = fl4->flowi4_mark;
2576 rth->rt_gateway = fl4->daddr;
2577 rth->rt_spec_dst= fl4->saddr;
2578 rth->rt_peer_genid = 0;
2579 rth->peer = NULL;
2580 rth->fi = NULL;
2581 1778
2582 RT_CACHE_STAT_INC(out_slow_tot); 1779 RT_CACHE_STAT_INC(out_slow_tot);
2583 1780
2584 if (flags & RTCF_LOCAL) { 1781 if (flags & RTCF_LOCAL)
2585 rth->dst.input = ip_local_deliver; 1782 rth->dst.input = ip_local_deliver;
2586 rth->rt_spec_dst = fl4->daddr;
2587 }
2588 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 1783 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589 rth->rt_spec_dst = fl4->saddr;
2590 if (flags & RTCF_LOCAL && 1784 if (flags & RTCF_LOCAL &&
2591 !(dev_out->flags & IFF_LOOPBACK)) { 1785 !(dev_out->flags & IFF_LOOPBACK)) {
2592 rth->dst.output = ip_mc_output; 1786 rth->dst.output = ip_mc_output;
@@ -2603,34 +1797,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2603#endif 1797#endif
2604 } 1798 }
2605 1799
2606 rt_set_nexthop(rth, fl4, res, fi, type, 0); 1800 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2607 1801
2608 return rth; 1802 return rth;
2609} 1803}
2610 1804
2611/* 1805/*
2612 * Major route resolver routine. 1806 * Major route resolver routine.
2613 * called with rcu_read_lock();
2614 */ 1807 */
2615 1808
2616static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) 1809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2617{ 1810{
2618 struct net_device *dev_out = NULL; 1811 struct net_device *dev_out = NULL;
2619 __u8 tos = RT_FL_TOS(fl4); 1812 __u8 tos = RT_FL_TOS(fl4);
2620 unsigned int flags = 0; 1813 unsigned int flags = 0;
2621 struct fib_result res; 1814 struct fib_result res;
2622 struct rtable *rth; 1815 struct rtable *rth;
2623 __be32 orig_daddr;
2624 __be32 orig_saddr;
2625 int orig_oif; 1816 int orig_oif;
2626 1817
1818 res.tclassid = 0;
2627 res.fi = NULL; 1819 res.fi = NULL;
2628#ifdef CONFIG_IP_MULTIPLE_TABLES 1820 res.table = NULL;
2629 res.r = NULL;
2630#endif
2631 1821
2632 orig_daddr = fl4->daddr;
2633 orig_saddr = fl4->saddr;
2634 orig_oif = fl4->flowi4_oif; 1822 orig_oif = fl4->flowi4_oif;
2635 1823
2636 fl4->flowi4_iif = net->loopback_dev->ifindex; 1824 fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -2730,6 +1918,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2730 1918
2731 if (fib_lookup(net, fl4, &res)) { 1919 if (fib_lookup(net, fl4, &res)) {
2732 res.fi = NULL; 1920 res.fi = NULL;
1921 res.table = NULL;
2733 if (fl4->flowi4_oif) { 1922 if (fl4->flowi4_oif) {
2734 /* Apparently, routing tables are wrong. Assume, 1923 /* Apparently, routing tables are wrong. Assume,
2735 that the destination is on link. 1924 that the destination is on link.
@@ -2791,60 +1980,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2791 1980
2792 1981
2793make_route: 1982make_route:
2794 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, 1983 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2795 tos, dev_out, flags);
2796 if (!IS_ERR(rth)) {
2797 unsigned int hash;
2798
2799 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800 rt_genid(dev_net(dev_out)));
2801 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802 }
2803 1984
2804out: 1985out:
2805 rcu_read_unlock(); 1986 rcu_read_unlock();
2806 return rth; 1987 return rth;
2807} 1988}
2808
2809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810{
2811 struct rtable *rth;
2812 unsigned int hash;
2813
2814 if (!rt_caching(net))
2815 goto slow_output;
2816
2817 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818
2819 rcu_read_lock_bh();
2820 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822 if (rth->rt_key_dst == flp4->daddr &&
2823 rth->rt_key_src == flp4->saddr &&
2824 rt_is_output_route(rth) &&
2825 rth->rt_oif == flp4->flowi4_oif &&
2826 rth->rt_mark == flp4->flowi4_mark &&
2827 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829 net_eq(dev_net(rth->dst.dev), net) &&
2830 !rt_is_expired(rth)) {
2831 ipv4_validate_peer(rth);
2832 dst_use(&rth->dst, jiffies);
2833 RT_CACHE_STAT_INC(out_hit);
2834 rcu_read_unlock_bh();
2835 if (!flp4->saddr)
2836 flp4->saddr = rth->rt_src;
2837 if (!flp4->daddr)
2838 flp4->daddr = rth->rt_dst;
2839 return rth;
2840 }
2841 RT_CACHE_STAT_INC(out_hlist_search);
2842 }
2843 rcu_read_unlock_bh();
2844
2845slow_output:
2846 return ip_route_output_slow(net, flp4);
2847}
2848EXPORT_SYMBOL_GPL(__ip_route_output_key); 1989EXPORT_SYMBOL_GPL(__ip_route_output_key);
2849 1990
2850static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 1991static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -2859,7 +2000,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2859 return mtu ? : dst->dev->mtu; 2000 return mtu ? : dst->dev->mtu;
2860} 2001}
2861 2002
2862static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2003static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2004 struct sk_buff *skb, u32 mtu)
2005{
2006}
2007
2008static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2009 struct sk_buff *skb)
2863{ 2010{
2864} 2011}
2865 2012
@@ -2872,53 +2019,40 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2872static struct dst_ops ipv4_dst_blackhole_ops = { 2019static struct dst_ops ipv4_dst_blackhole_ops = {
2873 .family = AF_INET, 2020 .family = AF_INET,
2874 .protocol = cpu_to_be16(ETH_P_IP), 2021 .protocol = cpu_to_be16(ETH_P_IP),
2875 .destroy = ipv4_dst_destroy,
2876 .check = ipv4_blackhole_dst_check, 2022 .check = ipv4_blackhole_dst_check,
2877 .mtu = ipv4_blackhole_mtu, 2023 .mtu = ipv4_blackhole_mtu,
2878 .default_advmss = ipv4_default_advmss, 2024 .default_advmss = ipv4_default_advmss,
2879 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2025 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2026 .redirect = ipv4_rt_blackhole_redirect,
2880 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2027 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2881 .neigh_lookup = ipv4_neigh_lookup, 2028 .neigh_lookup = ipv4_neigh_lookup,
2882}; 2029};
2883 2030
2884struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2031struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885{ 2032{
2886 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887 struct rtable *ort = (struct rtable *) dst_orig; 2033 struct rtable *ort = (struct rtable *) dst_orig;
2034 struct rtable *rt;
2888 2035
2036 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2889 if (rt) { 2037 if (rt) {
2890 struct dst_entry *new = &rt->dst; 2038 struct dst_entry *new = &rt->dst;
2891 2039
2892 new->__use = 1; 2040 new->__use = 1;
2893 new->input = dst_discard; 2041 new->input = dst_discard;
2894 new->output = dst_discard; 2042 new->output = dst_discard;
2895 dst_copy_metrics(new, &ort->dst);
2896 2043
2897 new->dev = ort->dst.dev; 2044 new->dev = ort->dst.dev;
2898 if (new->dev) 2045 if (new->dev)
2899 dev_hold(new->dev); 2046 dev_hold(new->dev);
2900 2047
2901 rt->rt_key_dst = ort->rt_key_dst; 2048 rt->rt_is_input = ort->rt_is_input;
2902 rt->rt_key_src = ort->rt_key_src;
2903 rt->rt_key_tos = ort->rt_key_tos;
2904 rt->rt_route_iif = ort->rt_route_iif;
2905 rt->rt_iif = ort->rt_iif; 2049 rt->rt_iif = ort->rt_iif;
2906 rt->rt_oif = ort->rt_oif; 2050 rt->rt_pmtu = ort->rt_pmtu;
2907 rt->rt_mark = ort->rt_mark;
2908 2051
2909 rt->rt_genid = rt_genid(net); 2052 rt->rt_genid = rt_genid(net);
2910 rt->rt_flags = ort->rt_flags; 2053 rt->rt_flags = ort->rt_flags;
2911 rt->rt_type = ort->rt_type; 2054 rt->rt_type = ort->rt_type;
2912 rt->rt_dst = ort->rt_dst;
2913 rt->rt_src = ort->rt_src;
2914 rt->rt_gateway = ort->rt_gateway; 2055 rt->rt_gateway = ort->rt_gateway;
2915 rt->rt_spec_dst = ort->rt_spec_dst;
2916 rt->peer = ort->peer;
2917 if (rt->peer)
2918 atomic_inc(&rt->peer->refcnt);
2919 rt->fi = ort->fi;
2920 if (rt->fi)
2921 atomic_inc(&rt->fi->fib_clntref);
2922 2056
2923 dst_free(new); 2057 dst_free(new);
2924 } 2058 }
@@ -2945,16 +2079,16 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2945} 2079}
2946EXPORT_SYMBOL_GPL(ip_route_output_flow); 2080EXPORT_SYMBOL_GPL(ip_route_output_flow);
2947 2081
2948static int rt_fill_info(struct net *net, 2082static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2949 struct sk_buff *skb, u32 pid, u32 seq, int event, 2083 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2950 int nowait, unsigned int flags) 2084 u32 seq, int event, int nowait, unsigned int flags)
2951{ 2085{
2952 struct rtable *rt = skb_rtable(skb); 2086 struct rtable *rt = skb_rtable(skb);
2953 struct rtmsg *r; 2087 struct rtmsg *r;
2954 struct nlmsghdr *nlh; 2088 struct nlmsghdr *nlh;
2955 unsigned long expires = 0; 2089 unsigned long expires = 0;
2956 const struct inet_peer *peer = rt->peer; 2090 u32 error;
2957 u32 id = 0, ts = 0, tsage = 0, error; 2091 u32 metrics[RTAX_MAX];
2958 2092
2959 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2093 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960 if (nlh == NULL) 2094 if (nlh == NULL)
@@ -2964,7 +2098,7 @@ static int rt_fill_info(struct net *net,
2964 r->rtm_family = AF_INET; 2098 r->rtm_family = AF_INET;
2965 r->rtm_dst_len = 32; 2099 r->rtm_dst_len = 32;
2966 r->rtm_src_len = 0; 2100 r->rtm_src_len = 0;
2967 r->rtm_tos = rt->rt_key_tos; 2101 r->rtm_tos = fl4->flowi4_tos;
2968 r->rtm_table = RT_TABLE_MAIN; 2102 r->rtm_table = RT_TABLE_MAIN;
2969 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) 2103 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970 goto nla_put_failure; 2104 goto nla_put_failure;
@@ -2975,11 +2109,11 @@ static int rt_fill_info(struct net *net,
2975 if (rt->rt_flags & RTCF_NOTIFY) 2109 if (rt->rt_flags & RTCF_NOTIFY)
2976 r->rtm_flags |= RTM_F_NOTIFY; 2110 r->rtm_flags |= RTM_F_NOTIFY;
2977 2111
2978 if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) 2112 if (nla_put_be32(skb, RTA_DST, dst))
2979 goto nla_put_failure; 2113 goto nla_put_failure;
2980 if (rt->rt_key_src) { 2114 if (src) {
2981 r->rtm_src_len = 32; 2115 r->rtm_src_len = 32;
2982 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) 2116 if (nla_put_be32(skb, RTA_SRC, src))
2983 goto nla_put_failure; 2117 goto nla_put_failure;
2984 } 2118 }
2985 if (rt->dst.dev && 2119 if (rt->dst.dev &&
@@ -2990,69 +2124,40 @@ static int rt_fill_info(struct net *net,
2990 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2124 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991 goto nla_put_failure; 2125 goto nla_put_failure;
2992#endif 2126#endif
2993 if (rt_is_input_route(rt)) { 2127 if (!rt_is_input_route(rt) &&
2994 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst)) 2128 fl4->saddr != src) {
2995 goto nla_put_failure; 2129 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2996 } else if (rt->rt_src != rt->rt_key_src) {
2997 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998 goto nla_put_failure; 2130 goto nla_put_failure;
2999 } 2131 }
3000 if (rt->rt_dst != rt->rt_gateway && 2132 if (rt->rt_gateway &&
3001 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) 2133 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002 goto nla_put_failure; 2134 goto nla_put_failure;
3003 2135
3004 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2136 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2137 if (rt->rt_pmtu)
2138 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2139 if (rtnetlink_put_metrics(skb, metrics) < 0)
3005 goto nla_put_failure; 2140 goto nla_put_failure;
3006 2141
3007 if (rt->rt_mark && 2142 if (fl4->flowi4_mark &&
3008 nla_put_be32(skb, RTA_MARK, rt->rt_mark)) 2143 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
3009 goto nla_put_failure; 2144 goto nla_put_failure;
3010 2145
3011 error = rt->dst.error; 2146 error = rt->dst.error;
3012 if (peer) { 2147 expires = rt->dst.expires;
3013 inet_peer_refcheck(rt->peer); 2148 if (expires) {
3014 id = atomic_read(&peer->ip_id_count) & 0xffff; 2149 if (time_before(jiffies, expires))
3015 if (peer->tcp_ts_stamp) { 2150 expires -= jiffies;
3016 ts = peer->tcp_ts; 2151 else
3017 tsage = get_seconds() - peer->tcp_ts_stamp; 2152 expires = 0;
3018 }
3019 expires = ACCESS_ONCE(peer->pmtu_expires);
3020 if (expires) {
3021 if (time_before(jiffies, expires))
3022 expires -= jiffies;
3023 else
3024 expires = 0;
3025 }
3026 } 2153 }
3027 2154
3028 if (rt_is_input_route(rt)) { 2155 if (rt_is_input_route(rt)) {
3029#ifdef CONFIG_IP_MROUTE 2156 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3030 __be32 dst = rt->rt_dst; 2157 goto nla_put_failure;
3031
3032 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034 int err = ipmr_get_route(net, skb,
3035 rt->rt_src, rt->rt_dst,
3036 r, nowait);
3037 if (err <= 0) {
3038 if (!nowait) {
3039 if (err == 0)
3040 return 0;
3041 goto nla_put_failure;
3042 } else {
3043 if (err == -EMSGSIZE)
3044 goto nla_put_failure;
3045 error = err;
3046 }
3047 }
3048 } else
3049#endif
3050 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051 goto nla_put_failure;
3052 } 2158 }
3053 2159
3054 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2160 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3055 expires, error) < 0)
3056 goto nla_put_failure; 2161 goto nla_put_failure;
3057 2162
3058 return nlmsg_end(skb, nlh); 2163 return nlmsg_end(skb, nlh);
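
rt_fill_info() above reports dst.expires to netlink as a remaining time, clamping values already in the past to zero, and relies on time_before() being safe across jiffies wraparound. A small sketch of the same idea for an unsigned 32-bit tick counter (helper names are invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <stdint.h>

static int ticks_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;   /* true if a is earlier than b */
}

static uint32_t remaining(uint32_t now, uint32_t expires)
{
	return ticks_before(now, expires) ? expires - now : 0;
}

int main(void)
{
	uint32_t now = 0xfffffff0u;    /* deliberately close to wraparound */

	printf("%u\n", remaining(now, now + 100));  /* 100, despite the wrap */
	printf("%u\n", remaining(now, now - 100));  /* 0, already expired    */
	return 0;
}
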
@@ -3068,6 +2173,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3068 struct rtmsg *rtm; 2173 struct rtmsg *rtm;
3069 struct nlattr *tb[RTA_MAX+1]; 2174 struct nlattr *tb[RTA_MAX+1];
3070 struct rtable *rt = NULL; 2175 struct rtable *rt = NULL;
2176 struct flowi4 fl4;
3071 __be32 dst = 0; 2177 __be32 dst = 0;
3072 __be32 src = 0; 2178 __be32 src = 0;
3073 u32 iif; 2179 u32 iif;
@@ -3102,6 +2208,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3102 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2208 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2209 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3104 2210
2211 memset(&fl4, 0, sizeof(fl4));
2212 fl4.daddr = dst;
2213 fl4.saddr = src;
2214 fl4.flowi4_tos = rtm->rtm_tos;
2215 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2216 fl4.flowi4_mark = mark;
2217
3105 if (iif) { 2218 if (iif) {
3106 struct net_device *dev; 2219 struct net_device *dev;
3107 2220
@@ -3122,13 +2235,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3122 if (err == 0 && rt->dst.error) 2235 if (err == 0 && rt->dst.error)
3123 err = -rt->dst.error; 2236 err = -rt->dst.error;
3124 } else { 2237 } else {
3125 struct flowi4 fl4 = {
3126 .daddr = dst,
3127 .saddr = src,
3128 .flowi4_tos = rtm->rtm_tos,
3129 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130 .flowi4_mark = mark,
3131 };
3132 rt = ip_route_output_key(net, &fl4); 2238 rt = ip_route_output_key(net, &fl4);
3133 2239
3134 err = 0; 2240 err = 0;
@@ -3143,7 +2249,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3143 if (rtm->rtm_flags & RTM_F_NOTIFY) 2249 if (rtm->rtm_flags & RTM_F_NOTIFY)
3144 rt->rt_flags |= RTCF_NOTIFY; 2250 rt->rt_flags |= RTCF_NOTIFY;
3145 2251
3146 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2252 err = rt_fill_info(net, dst, src, &fl4, skb,
2253 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147 RTM_NEWROUTE, 0, 0); 2254 RTM_NEWROUTE, 0, 0);
3148 if (err <= 0) 2255 if (err <= 0)
3149 goto errout_free; 2256 goto errout_free;
@@ -3159,43 +2266,6 @@ errout_free:
3159 2266
3160int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 2267int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3161{ 2268{
3162 struct rtable *rt;
3163 int h, s_h;
3164 int idx, s_idx;
3165 struct net *net;
3166
3167 net = sock_net(skb->sk);
3168
3169 s_h = cb->args[0];
3170 if (s_h < 0)
3171 s_h = 0;
3172 s_idx = idx = cb->args[1];
3173 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174 if (!rt_hash_table[h].chain)
3175 continue;
3176 rcu_read_lock_bh();
3177 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180 continue;
3181 if (rt_is_expired(rt))
3182 continue;
3183 skb_dst_set_noref(skb, &rt->dst);
3184 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186 1, NLM_F_MULTI) <= 0) {
3187 skb_dst_drop(skb);
3188 rcu_read_unlock_bh();
3189 goto done;
3190 }
3191 skb_dst_drop(skb);
3192 }
3193 rcu_read_unlock_bh();
3194 }
3195
3196done:
3197 cb->args[0] = h;
3198 cb->args[1] = idx;
3199 return skb->len; 2269 return skb->len;
3200} 2270}
3201 2271
@@ -3400,26 +2470,34 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3400 .init = rt_genid_init, 2470 .init = rt_genid_init,
3401}; 2471};
3402 2472
2473static int __net_init ipv4_inetpeer_init(struct net *net)
2474{
2475 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3403 2476
3404#ifdef CONFIG_IP_ROUTE_CLASSID 2477 if (!bp)
3405struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 2478 return -ENOMEM;
3406#endif /* CONFIG_IP_ROUTE_CLASSID */ 2479 inet_peer_base_init(bp);
2480 net->ipv4.peers = bp;
2481 return 0;
2482}
3407 2483
3408static __initdata unsigned long rhash_entries; 2484static void __net_exit ipv4_inetpeer_exit(struct net *net)
3409static int __init set_rhash_entries(char *str)
3410{ 2485{
3411 ssize_t ret; 2486 struct inet_peer_base *bp = net->ipv4.peers;
3412 2487
3413 if (!str) 2488 net->ipv4.peers = NULL;
3414 return 0; 2489 inetpeer_invalidate_tree(bp);
2490 kfree(bp);
2491}
3415 2492
3416 ret = kstrtoul(str, 0, &rhash_entries); 2493static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3417 if (ret) 2494 .init = ipv4_inetpeer_init,
3418 return 0; 2495 .exit = ipv4_inetpeer_exit,
2496};
3419 2497
3420 return 1; 2498#ifdef CONFIG_IP_ROUTE_CLASSID
3421} 2499struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3422__setup("rhash_entries=", set_rhash_entries); 2500#endif /* CONFIG_IP_ROUTE_CLASSID */
3423 2501
3424int __init ip_rt_init(void) 2502int __init ip_rt_init(void)
3425{ 2503{
@@ -3443,31 +2521,12 @@ int __init ip_rt_init(void)
3443 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 2521 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 2522 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445 2523
3446 rt_hash_table = (struct rt_hash_bucket *) 2524 ipv4_dst_ops.gc_thresh = ~0;
3447 alloc_large_system_hash("IP route cache", 2525 ip_rt_max_size = INT_MAX;
3448 sizeof(struct rt_hash_bucket),
3449 rhash_entries,
3450 (totalram_pages >= 128 * 1024) ?
3451 15 : 17,
3452 0,
3453 &rt_hash_log,
3454 &rt_hash_mask,
3455 0,
3456 rhash_entries ? 0 : 512 * 1024);
3457 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458 rt_hash_lock_init();
3459
3460 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462 2526
3463 devinet_init(); 2527 devinet_init();
3464 ip_fib_init(); 2528 ip_fib_init();
3465 2529
3466 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467 expires_ljiffies = jiffies;
3468 schedule_delayed_work(&expires_work,
3469 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470
3471 if (ip_rt_proc_init()) 2530 if (ip_rt_proc_init())
3472 pr_err("Unable to create route proc files\n"); 2531 pr_err("Unable to create route proc files\n");
3473#ifdef CONFIG_XFRM 2532#ifdef CONFIG_XFRM
@@ -3480,6 +2539,7 @@ int __init ip_rt_init(void)
3480 register_pernet_subsys(&sysctl_route_ops); 2539 register_pernet_subsys(&sysctl_route_ops);
3481#endif 2540#endif
3482 register_pernet_subsys(&rt_genid_ops); 2541 register_pernet_subsys(&rt_genid_ops);
2542 register_pernet_subsys(&ipv4_inetpeer_ops);
3483 return rc; 2543 return rc;
3484} 2544}
3485 2545
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7fb15d1..650e1528e1e6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
293 293
294 /* check for timestamp cookie support */ 294 /* check for timestamp cookie support */
295 memset(&tcp_opt, 0, sizeof(tcp_opt)); 295 memset(&tcp_opt, 0, sizeof(tcp_opt));
296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0); 296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
297 297
298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) 298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
299 goto out; 299 goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ef32956ed655..5840c3255721 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -301,6 +301,13 @@ static struct ctl_table ipv4_table[] = {
301 .proc_handler = proc_dointvec 301 .proc_handler = proc_dointvec
302 }, 302 },
303 { 303 {
304 .procname = "ip_early_demux",
305 .data = &sysctl_ip_early_demux,
306 .maxlen = sizeof(int),
307 .mode = 0644,
308 .proc_handler = proc_dointvec
309 },
310 {
304 .procname = "ip_dynaddr", 311 .procname = "ip_dynaddr",
305 .data = &sysctl_ip_dynaddr, 312 .data = &sysctl_ip_dynaddr,
306 .maxlen = sizeof(int), 313 .maxlen = sizeof(int),
@@ -360,6 +367,13 @@ static struct ctl_table ipv4_table[] = {
360 }, 367 },
361#endif 368#endif
362 { 369 {
370 .procname = "tcp_fastopen",
371 .data = &sysctl_tcp_fastopen,
372 .maxlen = sizeof(int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
363 .procname = "tcp_tw_recycle", 377 .procname = "tcp_tw_recycle",
364 .data = &tcp_death_row.sysctl_tw_recycle, 378 .data = &tcp_death_row.sysctl_tw_recycle,
365 .maxlen = sizeof(int), 379 .maxlen = sizeof(int),
@@ -591,6 +605,20 @@ static struct ctl_table ipv4_table[] = {
591 .mode = 0644, 605 .mode = 0644,
592 .proc_handler = proc_dointvec 606 .proc_handler = proc_dointvec
593 }, 607 },
608 {
609 .procname = "tcp_limit_output_bytes",
610 .data = &sysctl_tcp_limit_output_bytes,
611 .maxlen = sizeof(int),
612 .mode = 0644,
613 .proc_handler = proc_dointvec
614 },
615 {
616 .procname = "tcp_challenge_ack_limit",
617 .data = &sysctl_tcp_challenge_ack_limit,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = proc_dointvec
621 },
594#ifdef CONFIG_NET_DMA 622#ifdef CONFIG_NET_DMA
595 { 623 {
596 .procname = "tcp_dma_copybreak", 624 .procname = "tcp_dma_copybreak",
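
The ipv4_table entries added above (ip_early_demux, tcp_fastopen, tcp_limit_output_bytes, tcp_challenge_ack_limit) appear as files under /proc/sys/net/ipv4/ once the patch is applied. A minimal reader, assuming a kernel that actually carries these knobs:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
	const char *keys[] = { "ip_early_demux", "tcp_fastopen",
			       "tcp_limit_output_bytes",
			       "tcp_challenge_ack_limit" };
	char path[128], buf[64];
	unsigned int i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", keys[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);   /* knob absent on unpatched kernels */
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("%s = %s", keys[i], buf);
		fclose(f);
	}
	return 0;
}
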
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..581ecf02c6b5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
270#include <linux/slab.h> 270#include <linux/slab.h>
271 271
272#include <net/icmp.h> 272#include <net/icmp.h>
273#include <net/inet_common.h>
273#include <net/tcp.h> 274#include <net/tcp.h>
274#include <net/xfrm.h> 275#include <net/xfrm.h>
275#include <net/ip.h> 276#include <net/ip.h>
@@ -376,6 +377,7 @@ void tcp_init_sock(struct sock *sk)
376 skb_queue_head_init(&tp->out_of_order_queue); 377 skb_queue_head_init(&tp->out_of_order_queue);
377 tcp_init_xmit_timers(sk); 378 tcp_init_xmit_timers(sk);
378 tcp_prequeue_init(tp); 379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
379 381
380 icsk->icsk_rto = TCP_TIMEOUT_INIT; 382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
381 tp->mdev = TCP_TIMEOUT_INIT; 383 tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +798,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
796 inet_csk(sk)->icsk_ext_hdr_len - 798 inet_csk(sk)->icsk_ext_hdr_len -
797 tp->tcp_header_len); 799 tp->tcp_header_len);
798 800
801 /* TSQ : try to have two TSO segments in flight */
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
799 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); 805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
800 806
801 /* We try hard to avoid divides here */ 807 /* We try hard to avoid divides here */
@@ -977,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
977 return tmp; 983 return tmp;
978} 984}
979 985
986void tcp_free_fastopen_req(struct tcp_sock *tp)
987{
988 if (tp->fastopen_req != NULL) {
989 kfree(tp->fastopen_req);
990 tp->fastopen_req = NULL;
991 }
992}
993
994static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
995{
996 struct tcp_sock *tp = tcp_sk(sk);
997 int err, flags;
998
999 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1000 return -EOPNOTSUPP;
1001 if (tp->fastopen_req != NULL)
1002 return -EALREADY; /* Another Fast Open is in progress */
1003
1004 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1005 sk->sk_allocation);
1006 if (unlikely(tp->fastopen_req == NULL))
1007 return -ENOBUFS;
1008 tp->fastopen_req->data = msg;
1009
1010 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1011 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1012 msg->msg_namelen, flags);
1013 *size = tp->fastopen_req->copied;
1014 tcp_free_fastopen_req(tp);
1015 return err;
1016}
1017
980int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1018int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
981 size_t size) 1019 size_t size)
982{ 1020{
983 struct iovec *iov; 1021 struct iovec *iov;
984 struct tcp_sock *tp = tcp_sk(sk); 1022 struct tcp_sock *tp = tcp_sk(sk);
985 struct sk_buff *skb; 1023 struct sk_buff *skb;
986 int iovlen, flags, err, copied; 1024 int iovlen, flags, err, copied = 0;
987 int mss_now = 0, size_goal; 1025 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
988 bool sg; 1026 bool sg;
989 long timeo; 1027 long timeo;
990 1028
991 lock_sock(sk); 1029 lock_sock(sk);
992 1030
993 flags = msg->msg_flags; 1031 flags = msg->msg_flags;
1032 if (flags & MSG_FASTOPEN) {
1033 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1034 if (err == -EINPROGRESS && copied_syn > 0)
1035 goto out;
1036 else if (err)
1037 goto out_err;
1038 offset = copied_syn;
1039 }
1040
994 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1041 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
995 1042
996 /* Wait for a connection to finish. */ 1043 /* Wait for a connection to finish. */
997 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1044 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
998 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1045 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
999 goto out_err; 1046 goto do_error;
1000 1047
1001 if (unlikely(tp->repair)) { 1048 if (unlikely(tp->repair)) {
1002 if (tp->repair_queue == TCP_RECV_QUEUE) { 1049 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1032,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1032 unsigned char __user *from = iov->iov_base; 1079 unsigned char __user *from = iov->iov_base;
1033 1080
1034 iov++; 1081 iov++;
1082 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1083 if (offset >= seglen) {
1084 offset -= seglen;
1085 continue;
1086 }
1087 seglen -= offset;
1088 from += offset;
1089 offset = 0;
1090 }
1035 1091
1036 while (seglen > 0) { 1092 while (seglen > 0) {
1037 int copy = 0; 1093 int copy = 0;
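
The MSG_FASTOPEN branch added to tcp_sendmsg() above lets a client hand data to an unconnected TCP socket and have it carried with the SYN, returning -EOPNOTSUPP unless TFO_CLIENT_ENABLE is set in sysctl_tcp_fastopen. A hypothetical client-side sketch, assuming a libc/kernel combination that exposes MSG_FASTOPEN; the server address is made up for the example:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000   /* value from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_in srv = { .sin_family = AF_INET,
				   .sin_port = htons(80) };
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	inet_pton(AF_INET, "192.0.2.1", &srv.sin_addr);  /* example address */

	/* Data rides on the SYN when a Fast Open cookie is available;
	 * otherwise the kernel falls back to a normal handshake. */
	if (sendto(fd, req, strlen(req), MSG_FASTOPEN,
		   (struct sockaddr *)&srv, sizeof(srv)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
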
@@ -1194,7 +1250,7 @@ out:
1194 if (copied && likely(!tp->repair)) 1250 if (copied && likely(!tp->repair))
1195 tcp_push(sk, flags, mss_now, tp->nonagle); 1251 tcp_push(sk, flags, mss_now, tp->nonagle);
1196 release_sock(sk); 1252 release_sock(sk);
1197 return copied; 1253 return copied + copied_syn;
1198 1254
1199do_fault: 1255do_fault:
1200 if (!skb->len) { 1256 if (!skb->len) {
@@ -1207,7 +1263,7 @@ do_fault:
1207 } 1263 }
1208 1264
1209do_error: 1265do_error:
1210 if (copied) 1266 if (copied + copied_syn)
1211 goto out; 1267 goto out;
1212out_err: 1268out_err:
1213 err = sk_stream_error(sk, flags, err); 1269 err = sk_stream_error(sk, flags, err);
@@ -3310,8 +3366,7 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3310 3366
3311#endif 3367#endif
3312 3368
3313/** 3369/* Each Responder maintains up to two secret values concurrently for
3314 * Each Responder maintains up to two secret values concurrently for
3315 * efficient secret rollover. Each secret value has 4 states: 3370 * efficient secret rollover. Each secret value has 4 states:
3316 * 3371 *
3317 * Generating. (tcp_secret_generating != tcp_secret_primary) 3372 * Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3563,6 +3618,8 @@ void __init tcp_init(void)
3563 pr_info("Hash tables configured (established %u bind %u)\n", 3618 pr_info("Hash tables configured (established %u bind %u)\n",
3564 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3619 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3565 3620
3621 tcp_metrics_init();
3622
3566 tcp_register_congestion_control(&tcp_reno); 3623 tcp_register_congestion_control(&tcp_reno);
3567 3624
3568 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); 3625 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
@@ -3573,4 +3630,5 @@ void __init tcp_init(void)
3573 tcp_secret_primary = &tcp_secret_one; 3630 tcp_secret_primary = &tcp_secret_one;
3574 tcp_secret_retiring = &tcp_secret_two; 3631 tcp_secret_retiring = &tcp_secret_two;
3575 tcp_secret_secondary = &tcp_secret_two; 3632 tcp_secret_secondary = &tcp_secret_two;
3633 tcp_tasklet_init();
3576} 3634}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 04dbd7ae7c62..4d4db16e336e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -307,6 +307,7 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
307void tcp_slow_start(struct tcp_sock *tp) 307void tcp_slow_start(struct tcp_sock *tp)
308{ 308{
309 int cnt; /* increase in packets */ 309 int cnt; /* increase in packets */
310 unsigned int delta = 0;
310 311
311 /* RFC3465: ABC Slow start 312 /* RFC3465: ABC Slow start
312 * Increase only after a full MSS of bytes is acked 313 * Increase only after a full MSS of bytes is acked
@@ -333,9 +334,9 @@ void tcp_slow_start(struct tcp_sock *tp)
333 tp->snd_cwnd_cnt += cnt; 334 tp->snd_cwnd_cnt += cnt;
334 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 335 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
335 tp->snd_cwnd_cnt -= tp->snd_cwnd; 336 tp->snd_cwnd_cnt -= tp->snd_cwnd;
336 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 337 delta++;
337 tp->snd_cwnd++;
338 } 338 }
339 tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
339} 340}
340EXPORT_SYMBOL_GPL(tcp_slow_start); 341EXPORT_SYMBOL_GPL(tcp_slow_start);
341 342
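
The tcp_slow_start() change above accumulates the would-be cwnd increments in delta and applies snd_cwnd_clamp once after the loop instead of bumping snd_cwnd inside it. A userspace rendering of the same arithmetic with invented starting values:

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
	unsigned int snd_cwnd = 10, snd_cwnd_clamp = 12, snd_cwnd_cnt = 45;
	unsigned int delta = 0;

	while (snd_cwnd_cnt >= snd_cwnd) {
		snd_cwnd_cnt -= snd_cwnd;
		delta++;               /* was: snd_cwnd++ if below the clamp */
	}
	snd_cwnd += delta;
	if (snd_cwnd > snd_cwnd_clamp)  /* min(snd_cwnd + delta, clamp) */
		snd_cwnd = snd_cwnd_clamp;

	printf("cwnd=%u cnt=%u\n", snd_cwnd, snd_cwnd_cnt);  /* cwnd=12 cnt=5 */
	return 0;
}
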
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000000..a7f729c409d7
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3
4int sysctl_tcp_fastopen;
5
6static int __init tcp_fastopen_init(void)
7{
8 return 0;
9}
10
11late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b224eb8bce8b..3e07a64ca44e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,12 +88,14 @@ int sysctl_tcp_app_win __read_mostly = 31;
88int sysctl_tcp_adv_win_scale __read_mostly = 1; 88int sysctl_tcp_adv_win_scale __read_mostly = 1;
89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
90 90
91/* rfc5961 challenge ack rate limiting */
92int sysctl_tcp_challenge_ack_limit = 100;
93
91int sysctl_tcp_stdurg __read_mostly; 94int sysctl_tcp_stdurg __read_mostly;
92int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
93int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
94int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
95int sysctl_tcp_frto_response __read_mostly; 98int sysctl_tcp_frto_response __read_mostly;
96int sysctl_tcp_nometrics_save __read_mostly;
97 99
98int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
99 101
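
sysctl_tcp_challenge_ack_limit, declared above, caps how many RFC 5961 challenge ACKs are sent per second; the accounting itself is not part of this hunk. A speculative userspace sketch of a fixed one-second window limiter of that general shape (all names invented):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>
#include <time.h>

static int limit = 100;        /* counterpart of tcp_challenge_ack_limit */
static time_t window_start;
static int window_count;

static int allow_event(time_t now)
{
	if (now != window_start) {    /* new one-second window */
		window_start = now;
		window_count = 0;
	}
	return window_count++ < limit;
}

int main(void)
{
	int allowed = 0, i;
	time_t now = time(NULL);

	for (i = 0; i < 1000; i++)
		allowed += allow_event(now);
	printf("allowed %d of 1000 in one window\n", allowed);  /* prints 100 */
	return 0;
}
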
@@ -701,7 +703,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
701/* Calculate rto without backoff. This is the second half of Van Jacobson's 703/* Calculate rto without backoff. This is the second half of Van Jacobson's
702 * routine referred to above. 704 * routine referred to above.
703 */ 705 */
704static inline void tcp_set_rto(struct sock *sk) 706void tcp_set_rto(struct sock *sk)
705{ 707{
706 const struct tcp_sock *tp = tcp_sk(sk); 708 const struct tcp_sock *tp = tcp_sk(sk);
707 /* Old crap is replaced with new one. 8) 709 /* Old crap is replaced with new one. 8)
@@ -728,109 +730,6 @@ static inline void tcp_set_rto(struct sock *sk)
728 tcp_bound_rto(sk); 730 tcp_bound_rto(sk);
729} 731}
730 732
731/* Save metrics learned by this TCP session.
732 This function is called only, when TCP finishes successfully
733 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
734 */
735void tcp_update_metrics(struct sock *sk)
736{
737 struct tcp_sock *tp = tcp_sk(sk);
738 struct dst_entry *dst = __sk_dst_get(sk);
739
740 if (sysctl_tcp_nometrics_save)
741 return;
742
743 dst_confirm(dst);
744
745 if (dst && (dst->flags & DST_HOST)) {
746 const struct inet_connection_sock *icsk = inet_csk(sk);
747 int m;
748 unsigned long rtt;
749
750 if (icsk->icsk_backoff || !tp->srtt) {
751 /* This session failed to estimate rtt. Why?
752 * Probably, no packets returned in time.
753 * Reset our results.
754 */
755 if (!(dst_metric_locked(dst, RTAX_RTT)))
756 dst_metric_set(dst, RTAX_RTT, 0);
757 return;
758 }
759
760 rtt = dst_metric_rtt(dst, RTAX_RTT);
761 m = rtt - tp->srtt;
762
763 /* If newly calculated rtt larger than stored one,
764 * store new one. Otherwise, use EWMA. Remember,
765 * rtt overestimation is always better than underestimation.
766 */
767 if (!(dst_metric_locked(dst, RTAX_RTT))) {
768 if (m <= 0)
769 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
770 else
771 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
772 }
773
774 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
775 unsigned long var;
776 if (m < 0)
777 m = -m;
778
779 /* Scale deviation to rttvar fixed point */
780 m >>= 1;
781 if (m < tp->mdev)
782 m = tp->mdev;
783
784 var = dst_metric_rtt(dst, RTAX_RTTVAR);
785 if (m >= var)
786 var = m;
787 else
788 var -= (var - m) >> 2;
789
790 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
791 }
792
793 if (tcp_in_initial_slowstart(tp)) {
794 /* Slow start still did not finish. */
795 if (dst_metric(dst, RTAX_SSTHRESH) &&
796 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
797 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
798 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
799 if (!dst_metric_locked(dst, RTAX_CWND) &&
800 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
801 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
802 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
803 icsk->icsk_ca_state == TCP_CA_Open) {
804 /* Cong. avoidance phase, cwnd is reliable. */
805 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
806 dst_metric_set(dst, RTAX_SSTHRESH,
807 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
808 if (!dst_metric_locked(dst, RTAX_CWND))
809 dst_metric_set(dst, RTAX_CWND,
810 (dst_metric(dst, RTAX_CWND) +
811 tp->snd_cwnd) >> 1);
812 } else {
813 /* Else slow start did not finish, cwnd is non-sense,
814 ssthresh may be also invalid.
815 */
816 if (!dst_metric_locked(dst, RTAX_CWND))
817 dst_metric_set(dst, RTAX_CWND,
818 (dst_metric(dst, RTAX_CWND) +
819 tp->snd_ssthresh) >> 1);
820 if (dst_metric(dst, RTAX_SSTHRESH) &&
821 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
822 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
823 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
824 }
825
826 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
827 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
828 tp->reordering != sysctl_tcp_reordering)
829 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
830 }
831 }
832}
833
834__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 733__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
835{ 734{
836 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 735 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +766,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
867 * Packet counting of FACK is based on in-order assumptions, therefore TCP 766 * Packet counting of FACK is based on in-order assumptions, therefore TCP
868 * disables it when reordering is detected 767 * disables it when reordering is detected
869 */ 768 */
870static void tcp_disable_fack(struct tcp_sock *tp) 769void tcp_disable_fack(struct tcp_sock *tp)
871{ 770{
872 /* RFC3517 uses different metric in lost marker => reset on change */ 771 /* RFC3517 uses different metric in lost marker => reset on change */
873 if (tcp_is_fack(tp)) 772 if (tcp_is_fack(tp))
@@ -881,86 +780,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 780 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
882} 781}
883 782
884/* Initialize metrics on socket. */
885
886static void tcp_init_metrics(struct sock *sk)
887{
888 struct tcp_sock *tp = tcp_sk(sk);
889 struct dst_entry *dst = __sk_dst_get(sk);
890
891 if (dst == NULL)
892 goto reset;
893
894 dst_confirm(dst);
895
896 if (dst_metric_locked(dst, RTAX_CWND))
897 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
898 if (dst_metric(dst, RTAX_SSTHRESH)) {
899 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
900 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
901 tp->snd_ssthresh = tp->snd_cwnd_clamp;
902 } else {
903 /* ssthresh may have been reduced unnecessarily during.
904 * 3WHS. Restore it back to its initial default.
905 */
906 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
907 }
908 if (dst_metric(dst, RTAX_REORDERING) &&
909 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
910 tcp_disable_fack(tp);
911 tcp_disable_early_retrans(tp);
912 tp->reordering = dst_metric(dst, RTAX_REORDERING);
913 }
914
915 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
916 goto reset;
917
918 /* Initial rtt is determined from SYN,SYN-ACK.
919 * The segment is small and rtt may appear much
920 * less than real one. Use per-dst memory
921 * to make it more realistic.
922 *
923 * A bit of theory. RTT is time passed after "normal" sized packet
924 * is sent until it is ACKed. In normal circumstances sending small
925 * packets force peer to delay ACKs and calculation is correct too.
926 * The algorithm is adaptive and, provided we follow specs, it
927 * NEVER underestimate RTT. BUT! If peer tries to make some clever
928 * tricks sort of "quick acks" for time long enough to decrease RTT
929 * to low value, and then abruptly stops to do it and starts to delay
930 * ACKs, wait for troubles.
931 */
932 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
933 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
934 tp->rtt_seq = tp->snd_nxt;
935 }
936 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
937 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
938 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
939 }
940 tcp_set_rto(sk);
941reset:
942 if (tp->srtt == 0) {
943 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
944 * 3WHS. This is most likely due to retransmission,
945 * including spurious one. Reset the RTO back to 3secs
946 * from the more aggressive 1sec to avoid more spurious
947 * retransmission.
948 */
949 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
950 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
951 }
952 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
953 * retransmitted. In light of RFC6298 more aggressive 1sec
954 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
955 * retransmission has occurred.
956 */
957 if (tp->total_retrans > 1)
958 tp->snd_cwnd = 1;
959 else
960 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
961 tp->snd_cwnd_stamp = tcp_time_stamp;
962}
963
964static void tcp_update_reordering(struct sock *sk, const int metric, 783static void tcp_update_reordering(struct sock *sk, const int metric,
965 const int ts) 784 const int ts)
966{ 785{
@@ -2702,7 +2521,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
2702/* Nothing was retransmitted or returned timestamp is less 2521/* Nothing was retransmitted or returned timestamp is less
2703 * than timestamp of the first retransmission. 2522 * than timestamp of the first retransmission.
2704 */ 2523 */
2705static inline int tcp_packet_delayed(const struct tcp_sock *tp) 2524static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2706{ 2525{
2707 return !tp->retrans_stamp || 2526 return !tp->retrans_stamp ||
2708 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2527 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2763,7 +2582,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2763 tp->snd_cwnd_stamp = tcp_time_stamp; 2582 tp->snd_cwnd_stamp = tcp_time_stamp;
2764} 2583}
2765 2584
2766static inline int tcp_may_undo(const struct tcp_sock *tp) 2585static inline bool tcp_may_undo(const struct tcp_sock *tp)
2767{ 2586{
2768 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); 2587 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2769} 2588}
@@ -3552,13 +3371,13 @@ static void tcp_ack_probe(struct sock *sk)
3552 } 3371 }
3553} 3372}
3554 3373
3555static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3374static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3556{ 3375{
3557 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3376 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3558 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3377 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3559} 3378}
3560 3379
3561static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3380static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3562{ 3381{
3563 const struct tcp_sock *tp = tcp_sk(sk); 3382 const struct tcp_sock *tp = tcp_sk(sk);
3564 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3383 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
@@ -3568,7 +3387,7 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3568/* Check that window update is acceptable. 3387/* Check that window update is acceptable.
3569 * The function assumes that snd_una<=ack<=snd_next. 3388 * The function assumes that snd_una<=ack<=snd_next.
3570 */ 3389 */
3571static inline int tcp_may_update_window(const struct tcp_sock *tp, 3390static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3572 const u32 ack, const u32 ack_seq, 3391 const u32 ack, const u32 ack_seq,
3573 const u32 nwin) 3392 const u32 nwin)
3574{ 3393{
@@ -3869,9 +3688,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3869 tcp_cong_avoid(sk, ack, prior_in_flight); 3688 tcp_cong_avoid(sk, ack, prior_in_flight);
3870 } 3689 }
3871 3690
3872 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3691 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3873 dst_confirm(__sk_dst_get(sk)); 3692 struct dst_entry *dst = __sk_dst_get(sk);
3874 3693 if (dst)
3694 dst_confirm(dst);
3695 }
3875 return 1; 3696 return 1;
3876 3697
3877no_queue: 3698no_queue:
@@ -3911,7 +3732,8 @@ old_ack:
3911 * the fast version below fails. 3732 * the fast version below fails.
3912 */ 3733 */
3913void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3734void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
3914 const u8 **hvpp, int estab) 3735 const u8 **hvpp, int estab,
3736 struct tcp_fastopen_cookie *foc)
3915{ 3737{
3916 const unsigned char *ptr; 3738 const unsigned char *ptr;
3917 const struct tcphdr *th = tcp_hdr(skb); 3739 const struct tcphdr *th = tcp_hdr(skb);
@@ -4018,8 +3840,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
4018 break; 3840 break;
4019 } 3841 }
4020 break; 3842 break;
4021 }
4022 3843
3844 case TCPOPT_EXP:
3845 /* Fast Open option shares code 254 using a
                                 3846 * 16-bit magic number. It's valid only in
3847 * SYN or SYN-ACK with an even size.
3848 */
3849 if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
3850 get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
3851 foc == NULL || !th->syn || (opsize & 1))
3852 break;
3853 foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
3854 if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
3855 foc->len <= TCP_FASTOPEN_COOKIE_MAX)
3856 memcpy(foc->val, ptr + 2, foc->len);
3857 else if (foc->len != 0)
3858 foc->len = -1;
3859 break;
3860
3861 }
4023 ptr += opsize-2; 3862 ptr += opsize-2;
4024 length -= opsize; 3863 length -= opsize;
4025 } 3864 }
@@ -4061,7 +3900,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
4061 if (tcp_parse_aligned_timestamp(tp, th)) 3900 if (tcp_parse_aligned_timestamp(tp, th))
4062 return true; 3901 return true;
4063 } 3902 }
4064 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); 3903 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
4065 return true; 3904 return true;
4066} 3905}
4067 3906
@@ -4167,7 +4006,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
4167 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); 4006 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
4168} 4007}
4169 4008
4170static inline int tcp_paws_discard(const struct sock *sk, 4009static inline bool tcp_paws_discard(const struct sock *sk,
4171 const struct sk_buff *skb) 4010 const struct sk_buff *skb)
4172{ 4011{
4173 const struct tcp_sock *tp = tcp_sk(sk); 4012 const struct tcp_sock *tp = tcp_sk(sk);
@@ -4189,7 +4028,7 @@ static inline int tcp_paws_discard(const struct sock *sk,
4189 * (borrowed from freebsd) 4028 * (borrowed from freebsd)
4190 */ 4029 */
4191 4030
4192static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 4031static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4193{ 4032{
4194 return !before(end_seq, tp->rcv_wup) && 4033 return !before(end_seq, tp->rcv_wup) &&
4195 !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 4034 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
@@ -4579,8 +4418,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4579 4418
4580 TCP_ECN_check_ce(tp, skb); 4419 TCP_ECN_check_ce(tp, skb);
4581 4420
4582 if (tcp_try_rmem_schedule(sk, skb->truesize)) { 4421 if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
4583 /* TODO: should increment a counter */ 4422 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4584 __kfree_skb(skb); 4423 __kfree_skb(skb);
4585 return; 4424 return;
4586 } 4425 }
@@ -4589,6 +4428,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4589 tp->pred_flags = 0; 4428 tp->pred_flags = 0;
4590 inet_csk_schedule_ack(sk); 4429 inet_csk_schedule_ack(sk);
4591 4430
4431 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4592 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", 4432 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4593 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4433 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4594 4434
@@ -4642,6 +4482,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4642 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { 4482 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4643 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4483 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4644 /* All the bits are present. Drop. */ 4484 /* All the bits are present. Drop. */
4485 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4645 __kfree_skb(skb); 4486 __kfree_skb(skb);
4646 skb = NULL; 4487 skb = NULL;
4647 tcp_dsack_set(sk, seq, end_seq); 4488 tcp_dsack_set(sk, seq, end_seq);
@@ -4680,6 +4521,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4680 __skb_unlink(skb1, &tp->out_of_order_queue); 4521 __skb_unlink(skb1, &tp->out_of_order_queue);
4681 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, 4522 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4682 TCP_SKB_CB(skb1)->end_seq); 4523 TCP_SKB_CB(skb1)->end_seq);
4524 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4683 __kfree_skb(skb1); 4525 __kfree_skb(skb1);
4684 } 4526 }
4685 4527
@@ -5372,7 +5214,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5372 return result; 5214 return result;
5373} 5215}
5374 5216
5375static inline int tcp_checksum_complete_user(struct sock *sk, 5217static inline bool tcp_checksum_complete_user(struct sock *sk,
5376 struct sk_buff *skb) 5218 struct sk_buff *skb)
5377{ 5219{
5378 return !skb_csum_unnecessary(skb) && 5220 return !skb_csum_unnecessary(skb) &&
@@ -5426,11 +5268,28 @@ out:
5426} 5268}
5427#endif /* CONFIG_NET_DMA */ 5269#endif /* CONFIG_NET_DMA */
5428 5270
5271static void tcp_send_challenge_ack(struct sock *sk)
5272{
                               5273 /* unprotected vars, we don't care about overwrites */
5274 static u32 challenge_timestamp;
5275 static unsigned int challenge_count;
5276 u32 now = jiffies / HZ;
5277
5278 if (now != challenge_timestamp) {
5279 challenge_timestamp = now;
5280 challenge_count = 0;
5281 }
5282 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
5283 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
5284 tcp_send_ack(sk);
5285 }
5286}
5287
5429/* Does PAWS and seqno based validation of an incoming segment, flags will 5288/* Does PAWS and seqno based validation of an incoming segment, flags will
5430 * play significant role here. 5289 * play significant role here.
5431 */ 5290 */
5432static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5291static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5433 const struct tcphdr *th, int syn_inerr) 5292 const struct tcphdr *th, int syn_inerr)
5434{ 5293{
5435 const u8 *hash_location; 5294 const u8 *hash_location;
5436 struct tcp_sock *tp = tcp_sk(sk); 5295 struct tcp_sock *tp = tcp_sk(sk);
@@ -5455,14 +5314,26 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5455 * an acknowledgment should be sent in reply (unless the RST 5314 * an acknowledgment should be sent in reply (unless the RST
5456 * bit is set, if so drop the segment and return)". 5315 * bit is set, if so drop the segment and return)".
5457 */ 5316 */
5458 if (!th->rst) 5317 if (!th->rst) {
5318 if (th->syn)
5319 goto syn_challenge;
5459 tcp_send_dupack(sk, skb); 5320 tcp_send_dupack(sk, skb);
5321 }
5460 goto discard; 5322 goto discard;
5461 } 5323 }
5462 5324
5463 /* Step 2: check RST bit */ 5325 /* Step 2: check RST bit */
5464 if (th->rst) { 5326 if (th->rst) {
5465 tcp_reset(sk); 5327 /* RFC 5961 3.2 :
5328 * If sequence number exactly matches RCV.NXT, then
5329 * RESET the connection
5330 * else
5331 * Send a challenge ACK
5332 */
5333 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5334 tcp_reset(sk);
5335 else
5336 tcp_send_challenge_ack(sk);
5466 goto discard; 5337 goto discard;
5467 } 5338 }
5468 5339
@@ -5473,20 +5344,23 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5473 5344
5474 /* step 3: check security and precedence [ignored] */ 5345 /* step 3: check security and precedence [ignored] */
5475 5346
5476 /* step 4: Check for a SYN in window. */ 5347 /* step 4: Check for a SYN
 5477 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {                    5348 * RFC 5961 4.2 : Send a challenge ack
5349 */
5350 if (th->syn) {
5351syn_challenge:
5478 if (syn_inerr) 5352 if (syn_inerr)
5479 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5353 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); 5354 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5481 tcp_reset(sk); 5355 tcp_send_challenge_ack(sk);
5482 return -1; 5356 goto discard;
5483 } 5357 }
5484 5358
5485 return 1; 5359 return true;
5486 5360
5487discard: 5361discard:
5488 __kfree_skb(skb); 5362 __kfree_skb(skb);
5489 return 0; 5363 return false;
5490} 5364}
5491 5365
5492/* 5366/*
@@ -5516,7 +5390,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5516 const struct tcphdr *th, unsigned int len) 5390 const struct tcphdr *th, unsigned int len)
5517{ 5391{
5518 struct tcp_sock *tp = tcp_sk(sk); 5392 struct tcp_sock *tp = tcp_sk(sk);
5519 int res;
5520 5393
5521 /* 5394 /*
5522 * Header prediction. 5395 * Header prediction.
@@ -5693,9 +5566,8 @@ slow_path:
5693 * Standard slow path. 5566 * Standard slow path.
5694 */ 5567 */
5695 5568
5696 res = tcp_validate_incoming(sk, skb, th, 1); 5569 if (!tcp_validate_incoming(sk, skb, th, 1))
5697 if (res <= 0) 5570 return 0;
5698 return -res;
5699 5571
5700step5: 5572step5:
5701 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) 5573 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
@@ -5729,8 +5601,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5729 5601
5730 tcp_set_state(sk, TCP_ESTABLISHED); 5602 tcp_set_state(sk, TCP_ESTABLISHED);
5731 5603
5732 if (skb != NULL) 5604 if (skb != NULL) {
5605 sk->sk_rx_dst = dst_clone(skb_dst(skb));
5733 security_inet_conn_established(sk, skb); 5606 security_inet_conn_established(sk, skb);
5607 }
5734 5608
5735 /* Make sure socket is routed, for correct metrics. */ 5609 /* Make sure socket is routed, for correct metrics. */
5736 icsk->icsk_af_ops->rebuild_header(sk); 5610 icsk->icsk_af_ops->rebuild_header(sk);
@@ -5760,6 +5634,45 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5760 } 5634 }
5761} 5635}
5762 5636
5637static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5638 struct tcp_fastopen_cookie *cookie)
5639{
5640 struct tcp_sock *tp = tcp_sk(sk);
5641 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5642 u16 mss = tp->rx_opt.mss_clamp;
5643 bool syn_drop;
5644
5645 if (mss == tp->rx_opt.user_mss) {
5646 struct tcp_options_received opt;
5647 const u8 *hash_location;
5648
5649 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5650 tcp_clear_options(&opt);
5651 opt.user_mss = opt.mss_clamp = 0;
5652 tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
5653 mss = opt.mss_clamp;
5654 }
5655
5656 if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
5657 cookie->len = -1;
5658
5659 /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
5660 * the remote receives only the retransmitted (regular) SYNs: either
5661 * the original SYN-data or the corresponding SYN-ACK is lost.
5662 */
5663 syn_drop = (cookie->len <= 0 && data &&
5664 inet_csk(sk)->icsk_retransmits);
5665
5666 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
5667
5668 if (data) { /* Retransmit unacked data in SYN */
5669 tcp_retransmit_skb(sk, data);
5670 tcp_rearm_rto(sk);
5671 return true;
5672 }
5673 return false;
5674}
5675
5763static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5676static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5764 const struct tcphdr *th, unsigned int len) 5677 const struct tcphdr *th, unsigned int len)
5765{ 5678{
@@ -5767,9 +5680,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5767 struct inet_connection_sock *icsk = inet_csk(sk); 5680 struct inet_connection_sock *icsk = inet_csk(sk);
5768 struct tcp_sock *tp = tcp_sk(sk); 5681 struct tcp_sock *tp = tcp_sk(sk);
5769 struct tcp_cookie_values *cvp = tp->cookie_values; 5682 struct tcp_cookie_values *cvp = tp->cookie_values;
5683 struct tcp_fastopen_cookie foc = { .len = -1 };
5770 int saved_clamp = tp->rx_opt.mss_clamp; 5684 int saved_clamp = tp->rx_opt.mss_clamp;
5771 5685
5772 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); 5686 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
5773 5687
5774 if (th->ack) { 5688 if (th->ack) {
5775 /* rfc793: 5689 /* rfc793:
@@ -5779,11 +5693,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5779 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5693 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
5780 * a reset (unless the RST bit is set, if so drop 5694 * a reset (unless the RST bit is set, if so drop
5781 * the segment and return)" 5695 * the segment and return)"
5782 *
5783 * We do not send data with SYN, so that RFC-correct
5784 * test reduces to:
5785 */ 5696 */
5786 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) 5697 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5698 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5787 goto reset_and_undo; 5699 goto reset_and_undo;
5788 5700
5789 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5701 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5895,6 +5807,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5895 5807
5896 tcp_finish_connect(sk, skb); 5808 tcp_finish_connect(sk, skb);
5897 5809
5810 if ((tp->syn_fastopen || tp->syn_data) &&
5811 tcp_rcv_fastopen_synack(sk, skb, &foc))
5812 return -1;
5813
5898 if (sk->sk_write_pending || 5814 if (sk->sk_write_pending ||
5899 icsk->icsk_accept_queue.rskq_defer_accept || 5815 icsk->icsk_accept_queue.rskq_defer_accept ||
5900 icsk->icsk_ack.pingpong) { 5816 icsk->icsk_ack.pingpong) {
@@ -6013,7 +5929,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6013 struct tcp_sock *tp = tcp_sk(sk); 5929 struct tcp_sock *tp = tcp_sk(sk);
6014 struct inet_connection_sock *icsk = inet_csk(sk); 5930 struct inet_connection_sock *icsk = inet_csk(sk);
6015 int queued = 0; 5931 int queued = 0;
6016 int res;
6017 5932
6018 tp->rx_opt.saw_tstamp = 0; 5933 tp->rx_opt.saw_tstamp = 0;
6019 5934
@@ -6068,9 +5983,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6068 return 0; 5983 return 0;
6069 } 5984 }
6070 5985
6071 res = tcp_validate_incoming(sk, skb, th, 0); 5986 if (!tcp_validate_incoming(sk, skb, th, 0))
6072 if (res <= 0) 5987 return 0;
6073 return -res;
6074 5988
6075 /* step 5: check the ACK field */ 5989 /* step 5: check the ACK field */
6076 if (th->ack) { 5990 if (th->ack) {
@@ -6126,9 +6040,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6126 6040
6127 case TCP_FIN_WAIT1: 6041 case TCP_FIN_WAIT1:
6128 if (tp->snd_una == tp->write_seq) { 6042 if (tp->snd_una == tp->write_seq) {
6043 struct dst_entry *dst;
6044
6129 tcp_set_state(sk, TCP_FIN_WAIT2); 6045 tcp_set_state(sk, TCP_FIN_WAIT2);
6130 sk->sk_shutdown |= SEND_SHUTDOWN; 6046 sk->sk_shutdown |= SEND_SHUTDOWN;
6131 dst_confirm(__sk_dst_get(sk)); 6047
6048 dst = __sk_dst_get(sk);
6049 if (dst)
6050 dst_confirm(dst);
6132 6051
6133 if (!sock_flag(sk, SOCK_DEAD)) 6052 if (!sock_flag(sk, SOCK_DEAD))
6134 /* Wake up lingering close() */ 6053 /* Wake up lingering close() */
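Most of the tcp_input.c churn above is RFC 5961 hardening: an RST whose sequence number does not exactly match rcv_nxt, or a SYN seen on an already-synchronized connection, now draws a challenge ACK instead of tearing the connection down, and tcp_send_challenge_ack() caps how many such ACKs go out per second via sysctl_tcp_challenge_ack_limit (default 100). A minimal standalone sketch of that limiter, with time() standing in for jiffies/HZ:

    /* Per-second challenge ACK rate limiter, mirroring the hunk above.
     * Userspace sketch; nothing here is kernel API. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    #define CHALLENGE_ACK_LIMIT 100  /* mirrors sysctl_tcp_challenge_ack_limit */

    static bool challenge_ack_allowed(void)
    {
            static time_t window;
            static unsigned int count;
            time_t now = time(NULL);

            if (now != window) {     /* new one-second window: reset counter */
                    window = now;
                    count = 0;
            }
            return ++count <= CHALLENGE_ACK_LIMIT;
    }

    int main(void)
    {
            unsigned int sent = 0;

            for (int i = 0; i < 1000; i++)
                    if (challenge_ack_allowed())
                            sent++;  /* the kernel would call tcp_send_ack() here */
            printf("challenge ACKs allowed this second: %u\n", sent);
            return 0;
    }

The unprotected statics mirror the kernel's choice: an occasional racy update only makes the limit slightly imprecise, which is acceptable for a defensive cap.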
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c8d28c433b2b..3e30548ac32a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
209 } 209 }
210 210
211 if (tcp_death_row.sysctl_tw_recycle && 211 if (tcp_death_row.sysctl_tw_recycle &&
212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { 212 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); 213 tcp_fetch_timewait_stamp(sk, &rt->dst);
214 /*
215 * VJ's idea. We save last timestamp seen from
216 * the destination in peer table, when entering state
217 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
218 * when trying new connection.
219 */
220 if (peer) {
221 inet_peer_refcheck(peer);
222 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
223 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
224 tp->rx_opt.ts_recent = peer->tcp_ts;
225 }
226 }
227 }
228 214
229 inet->inet_dport = usin->sin_port; 215 inet->inet_dport = usin->sin_port;
230 inet->inet_daddr = daddr; 216 inet->inet_daddr = daddr;
@@ -289,12 +275,15 @@ failure:
289EXPORT_SYMBOL(tcp_v4_connect); 275EXPORT_SYMBOL(tcp_v4_connect);
290 276
291/* 277/*
292 * This routine does path mtu discovery as defined in RFC1191. 278 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 279 * It can be called through tcp_release_cb() if the socket was owned by the user
 280 * at the time tcp_v4_err() was called to handle the ICMP message.
293 */ 281 */
294static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) 282static void tcp_v4_mtu_reduced(struct sock *sk)
295{ 283{
296 struct dst_entry *dst; 284 struct dst_entry *dst;
297 struct inet_sock *inet = inet_sk(sk); 285 struct inet_sock *inet = inet_sk(sk);
286 u32 mtu = tcp_sk(sk)->mtu_info;
298 287
299 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 288 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300 * send out by Linux are always <576bytes so they should go through 289 * send out by Linux are always <576bytes so they should go through
@@ -303,17 +292,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
303 if (sk->sk_state == TCP_LISTEN) 292 if (sk->sk_state == TCP_LISTEN)
304 return; 293 return;
305 294
306 /* We don't check in the destentry if pmtu discovery is forbidden 295 dst = inet_csk_update_pmtu(sk, mtu);
307 * on this route. We just assume that no packet_to_big packets 296 if (!dst)
308 * are send back when pmtu discovery is not active.
309 * There is a small race when the user changes this flag in the
310 * route, but I think that's acceptable.
311 */
312 if ((dst = __sk_dst_check(sk, 0)) == NULL)
313 return; 297 return;
314 298
315 dst->ops->update_pmtu(dst, mtu);
316
317 /* Something is about to be wrong... Remember soft error 299 /* Something is about to be wrong... Remember soft error
318 * for the case, if this connection will not able to recover. 300 * for the case, if this connection will not able to recover.
319 */ 301 */
@@ -335,6 +317,14 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
335 } /* else let the usual retransmit timer handle it */ 317 } /* else let the usual retransmit timer handle it */
336} 318}
337 319
320static void do_redirect(struct sk_buff *skb, struct sock *sk)
321{
322 struct dst_entry *dst = __sk_dst_check(sk, 0);
323
324 if (dst)
325 dst->ops->redirect(dst, sk, skb);
326}
327
338/* 328/*
339 * This routine is called by the ICMP module when it gets some 329 * This routine is called by the ICMP module when it gets some
340 * sort of error condition. If err < 0 then the socket should 330 * sort of error condition. If err < 0 then the socket should
@@ -386,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
386 bh_lock_sock(sk); 376 bh_lock_sock(sk);
387 /* If too many ICMPs get dropped on busy 377 /* If too many ICMPs get dropped on busy
388 * servers this needs to be solved differently. 378 * servers this needs to be solved differently.
 379 * We do take care of the PMTU discovery (RFC1191) special case :
 380 * we can receive locally generated ICMP messages while the socket is held.
389 */ 381 */
390 if (sock_owned_by_user(sk)) 382 if (sock_owned_by_user(sk) &&
383 type != ICMP_DEST_UNREACH &&
384 code != ICMP_FRAG_NEEDED)
391 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); 385 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
392 386
393 if (sk->sk_state == TCP_CLOSE) 387 if (sk->sk_state == TCP_CLOSE)
@@ -408,6 +402,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
408 } 402 }
409 403
410 switch (type) { 404 switch (type) {
405 case ICMP_REDIRECT:
406 do_redirect(icmp_skb, sk);
407 goto out;
411 case ICMP_SOURCE_QUENCH: 408 case ICMP_SOURCE_QUENCH:
412 /* Just silently ignore these. */ 409 /* Just silently ignore these. */
413 goto out; 410 goto out;
@@ -419,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
419 goto out; 416 goto out;
420 417
421 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
419 tp->mtu_info = info;
422 if (!sock_owned_by_user(sk)) 420 if (!sock_owned_by_user(sk))
423 do_pmtu_discovery(sk, iph, info); 421 tcp_v4_mtu_reduced(sk);
422 else
423 set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
424 goto out; 424 goto out;
425 } 425 }
426 426
@@ -698,8 +698,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
698 698
699 net = dev_net(skb_dst(skb)->dev); 699 net = dev_net(skb_dst(skb)->dev);
700 arg.tos = ip_hdr(skb)->tos; 700 arg.tos = ip_hdr(skb)->tos;
701 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 701 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
702 &arg, arg.iov[0].iov_len); 702 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
703 703
704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 704 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 705 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
@@ -781,8 +781,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
781 if (oif) 781 if (oif)
782 arg.bound_dev_if = oif; 782 arg.bound_dev_if = oif;
783 arg.tos = tos; 783 arg.tos = tos;
784 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 784 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
785 &arg, arg.iov[0].iov_len); 785 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
786 786
787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 787 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788} 788}
@@ -825,7 +825,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 825static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
826 struct request_sock *req, 826 struct request_sock *req,
827 struct request_values *rvp, 827 struct request_values *rvp,
828 u16 queue_mapping) 828 u16 queue_mapping,
829 bool nocache)
829{ 830{
830 const struct inet_request_sock *ireq = inet_rsk(req); 831 const struct inet_request_sock *ireq = inet_rsk(req);
831 struct flowi4 fl4; 832 struct flowi4 fl4;
@@ -848,7 +849,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848 err = net_xmit_eval(err); 849 err = net_xmit_eval(err);
849 } 850 }
850 851
851 dst_release(dst);
852 return err; 852 return err;
853} 853}
854 854
@@ -856,7 +856,7 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
856 struct request_values *rvp) 856 struct request_values *rvp)
857{ 857{
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859 return tcp_v4_send_synack(sk, NULL, req, rvp, 0); 859 return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
860} 860}
861 861
862/* 862/*
@@ -1317,7 +1317,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1317 tcp_clear_options(&tmp_opt); 1317 tcp_clear_options(&tmp_opt);
1318 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1318 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1319 tmp_opt.user_mss = tp->rx_opt.user_mss; 1319 tmp_opt.user_mss = tp->rx_opt.user_mss;
1320 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 1320 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
1321 1321
1322 if (tmp_opt.cookie_plus > 0 && 1322 if (tmp_opt.cookie_plus > 0 &&
1323 tmp_opt.saw_tstamp && 1323 tmp_opt.saw_tstamp &&
@@ -1375,7 +1375,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1376 req->cookie_ts = tmp_opt.tstamp_ok; 1376 req->cookie_ts = tmp_opt.tstamp_ok;
1377 } else if (!isn) { 1377 } else if (!isn) {
1378 struct inet_peer *peer = NULL;
1379 struct flowi4 fl4; 1378 struct flowi4 fl4;
1380 1379
1381 /* VJ's idea. We save last timestamp seen 1380 /* VJ's idea. We save last timestamp seen
@@ -1390,12 +1389,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1390 if (tmp_opt.saw_tstamp && 1389 if (tmp_opt.saw_tstamp &&
1391 tcp_death_row.sysctl_tw_recycle && 1390 tcp_death_row.sysctl_tw_recycle &&
1392 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && 1391 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1393 fl4.daddr == saddr && 1392 fl4.daddr == saddr) {
1394 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { 1393 if (!tcp_peer_is_proven(req, dst, true)) {
1395 inet_peer_refcheck(peer);
1396 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1397 (s32)(peer->tcp_ts - req->ts_recent) >
1398 TCP_PAWS_WINDOW) {
1399 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1394 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1400 goto drop_and_release; 1395 goto drop_and_release;
1401 } 1396 }
@@ -1404,8 +1399,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1404 else if (!sysctl_tcp_syncookies && 1399 else if (!sysctl_tcp_syncookies &&
1405 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1400 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1406 (sysctl_max_syn_backlog >> 2)) && 1401 (sysctl_max_syn_backlog >> 2)) &&
1407 (!peer || !peer->tcp_ts_stamp) && 1402 !tcp_peer_is_proven(req, dst, false)) {
1408 (!dst || !dst_metric(dst, RTAX_RTT))) {
1409 /* Without syncookies last quarter of 1403 /* Without syncookies last quarter of
1410 * backlog is filled with destinations, 1404 * backlog is filled with destinations,
1411 * proven to be alive. 1405 * proven to be alive.
@@ -1425,7 +1419,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1425 1419
1426 if (tcp_v4_send_synack(sk, dst, req, 1420 if (tcp_v4_send_synack(sk, dst, req,
1427 (struct request_values *)&tmp_ext, 1421 (struct request_values *)&tmp_ext,
1428 skb_get_queue_mapping(skb)) || 1422 skb_get_queue_mapping(skb),
1423 want_cookie) ||
1429 want_cookie) 1424 want_cookie)
1430 goto drop_and_free; 1425 goto drop_and_free;
1431 1426
@@ -1623,6 +1618,20 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1623 1618
1624 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1619 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1625 sock_rps_save_rxhash(sk, skb); 1620 sock_rps_save_rxhash(sk, skb);
1621 if (sk->sk_rx_dst) {
1622 struct dst_entry *dst = sk->sk_rx_dst;
1623 if (dst->ops->check(dst, 0) == NULL) {
1624 dst_release(dst);
1625 sk->sk_rx_dst = NULL;
1626 }
1627 }
1628 if (unlikely(sk->sk_rx_dst == NULL)) {
1629 struct inet_sock *icsk = inet_sk(sk);
1630 struct rtable *rt = skb_rtable(skb);
1631
1632 sk->sk_rx_dst = dst_clone(&rt->dst);
1633 icsk->rx_dst_ifindex = inet_iif(skb);
1634 }
1626 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1635 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1627 rsk = sk; 1636 rsk = sk;
1628 goto reset; 1637 goto reset;
@@ -1672,6 +1681,49 @@ csum_err:
1672} 1681}
1673EXPORT_SYMBOL(tcp_v4_do_rcv); 1682EXPORT_SYMBOL(tcp_v4_do_rcv);
1674 1683
1684void tcp_v4_early_demux(struct sk_buff *skb)
1685{
1686 struct net *net = dev_net(skb->dev);
1687 const struct iphdr *iph;
1688 const struct tcphdr *th;
1689 struct net_device *dev;
1690 struct sock *sk;
1691
1692 if (skb->pkt_type != PACKET_HOST)
1693 return;
1694
1695 if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1696 return;
1697
1698 iph = ip_hdr(skb);
1699 th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1700
1701 if (th->doff < sizeof(struct tcphdr) / 4)
1702 return;
1703
1704 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1705 return;
1706
1707 dev = skb->dev;
1708 sk = __inet_lookup_established(net, &tcp_hashinfo,
1709 iph->saddr, th->source,
1710 iph->daddr, ntohs(th->dest),
1711 dev->ifindex);
1712 if (sk) {
1713 skb->sk = sk;
1714 skb->destructor = sock_edemux;
1715 if (sk->sk_state != TCP_TIME_WAIT) {
1716 struct dst_entry *dst = sk->sk_rx_dst;
1717 struct inet_sock *icsk = inet_sk(sk);
1718 if (dst)
1719 dst = dst_check(dst, 0);
1720 if (dst &&
1721 icsk->rx_dst_ifindex == dev->ifindex)
1722 skb_dst_set_noref(skb, dst);
1723 }
1724 }
1725}
1726
1675/* 1727/*
1676 * From tcp_input.c 1728 * From tcp_input.c
1677 */ 1729 */
@@ -1821,40 +1873,10 @@ do_time_wait:
1821 goto discard_it; 1873 goto discard_it;
1822} 1874}
1823 1875
1824struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1825{
1826 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1827 struct inet_sock *inet = inet_sk(sk);
1828 struct inet_peer *peer;
1829
1830 if (!rt ||
1831 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1832 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1833 *release_it = true;
1834 } else {
1835 if (!rt->peer)
1836 rt_bind_peer(rt, inet->inet_daddr, 1);
1837 peer = rt->peer;
1838 *release_it = false;
1839 }
1840
1841 return peer;
1842}
1843EXPORT_SYMBOL(tcp_v4_get_peer);
1844
1845void *tcp_v4_tw_get_peer(struct sock *sk)
1846{
1847 const struct inet_timewait_sock *tw = inet_twsk(sk);
1848
1849 return inet_getpeer_v4(tw->tw_daddr, 1);
1850}
1851EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1852
1853static struct timewait_sock_ops tcp_timewait_sock_ops = { 1876static struct timewait_sock_ops tcp_timewait_sock_ops = {
1854 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1877 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1855 .twsk_unique = tcp_twsk_unique, 1878 .twsk_unique = tcp_twsk_unique,
1856 .twsk_destructor= tcp_twsk_destructor, 1879 .twsk_destructor= tcp_twsk_destructor,
1857 .twsk_getpeer = tcp_v4_tw_get_peer,
1858}; 1880};
1859 1881
1860const struct inet_connection_sock_af_ops ipv4_specific = { 1882const struct inet_connection_sock_af_ops ipv4_specific = {
@@ -1863,7 +1885,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1863 .rebuild_header = inet_sk_rebuild_header, 1885 .rebuild_header = inet_sk_rebuild_header,
1864 .conn_request = tcp_v4_conn_request, 1886 .conn_request = tcp_v4_conn_request,
1865 .syn_recv_sock = tcp_v4_syn_recv_sock, 1887 .syn_recv_sock = tcp_v4_syn_recv_sock,
1866 .get_peer = tcp_v4_get_peer,
1867 .net_header_len = sizeof(struct iphdr), 1888 .net_header_len = sizeof(struct iphdr),
1868 .setsockopt = ip_setsockopt, 1889 .setsockopt = ip_setsockopt,
1869 .getsockopt = ip_getsockopt, 1890 .getsockopt = ip_getsockopt,
@@ -1953,6 +1974,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
1953 tp->cookie_values = NULL; 1974 tp->cookie_values = NULL;
1954 } 1975 }
1955 1976
1977 /* If socket is aborted during connect operation */
1978 tcp_free_fastopen_req(tp);
1979
1956 sk_sockets_allocated_dec(sk); 1980 sk_sockets_allocated_dec(sk);
1957 sock_release_memcg(sk); 1981 sock_release_memcg(sk);
1958} 1982}
@@ -2593,6 +2617,8 @@ struct proto tcp_prot = {
2593 .sendmsg = tcp_sendmsg, 2617 .sendmsg = tcp_sendmsg,
2594 .sendpage = tcp_sendpage, 2618 .sendpage = tcp_sendpage,
2595 .backlog_rcv = tcp_v4_do_rcv, 2619 .backlog_rcv = tcp_v4_do_rcv,
2620 .release_cb = tcp_release_cb,
2621 .mtu_reduced = tcp_v4_mtu_reduced,
2596 .hash = inet_hash, 2622 .hash = inet_hash,
2597 .unhash = inet_unhash, 2623 .unhash = inet_unhash,
2598 .get_port = inet_csk_get_port, 2624 .get_port = inet_csk_get_port,
@@ -2624,13 +2650,11 @@ EXPORT_SYMBOL(tcp_prot);
2624 2650
2625static int __net_init tcp_sk_init(struct net *net) 2651static int __net_init tcp_sk_init(struct net *net)
2626{ 2652{
2627 return inet_ctl_sock_create(&net->ipv4.tcp_sock, 2653 return 0;
2628 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2629} 2654}
2630 2655
2631static void __net_exit tcp_sk_exit(struct net *net) 2656static void __net_exit tcp_sk_exit(struct net *net)
2632{ 2657{
2633 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2634} 2658}
2635 2659
2636static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2660static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
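The tcp_ipv4.c hunks above move PMTU handling onto the new deferral path: tcp_v4_err() stashes the ICMP_FRAG_NEEDED value in tp->mtu_info and, if the socket is currently owned by user context, sets TCP_MTU_REDUCED_DEFERRED in tsq_flags so that the .mtu_reduced handler runs later from .release_cb. A hedged userspace sketch of that set-a-flag-now, handle-on-release pattern (plain fields stand in for the socket lock and atomic bitops):

    /* Illustrative sketch (not kernel code) of deferring work with a flag bit
     * while the "lock holder" is active, then running it on release. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MTU_REDUCED_DEFERRED (1u << 0)

    struct fake_sock {
            bool owned_by_user;
            unsigned int deferred_flags;
            unsigned int mtu_info;
    };

    static void mtu_reduced(struct fake_sock *sk)
    {
            printf("shrinking path MTU to %u\n", sk->mtu_info);
    }

    static void icmp_frag_needed(struct fake_sock *sk, unsigned int mtu)
    {
            sk->mtu_info = mtu;
            if (!sk->owned_by_user)
                    mtu_reduced(sk);                            /* handle now */
            else
                    sk->deferred_flags |= MTU_REDUCED_DEFERRED; /* defer */
    }

    static void release_sock(struct fake_sock *sk)
    {
            if (sk->deferred_flags & MTU_REDUCED_DEFERRED) {
                    sk->deferred_flags &= ~MTU_REDUCED_DEFERRED;
                    mtu_reduced(sk);
            }
            sk->owned_by_user = false;
    }

    int main(void)
    {
            struct fake_sock sk = { .owned_by_user = true };

            icmp_frag_needed(&sk, 1400); /* arrives while user holds the socket */
            release_sock(&sk);           /* deferred handler runs here */
            return 0;
    }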
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..2288a6399e1e
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,745 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/bootmem.h>
5#include <linux/module.h>
6#include <linux/cache.h>
7#include <linux/slab.h>
8#include <linux/init.h>
9#include <linux/tcp.h>
10#include <linux/hash.h>
11
12#include <net/inet_connection_sock.h>
13#include <net/net_namespace.h>
14#include <net/request_sock.h>
15#include <net/inetpeer.h>
16#include <net/sock.h>
17#include <net/ipv6.h>
18#include <net/dst.h>
19#include <net/tcp.h>
20
21int sysctl_tcp_nometrics_save __read_mostly;
22
23enum tcp_metric_index {
24 TCP_METRIC_RTT,
25 TCP_METRIC_RTTVAR,
26 TCP_METRIC_SSTHRESH,
27 TCP_METRIC_CWND,
28 TCP_METRIC_REORDERING,
29
30 /* Always last. */
31 TCP_METRIC_MAX,
32};
33
34struct tcp_fastopen_metrics {
35 u16 mss;
36 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
37 unsigned long last_syn_loss; /* Last Fast Open SYN loss */
38 struct tcp_fastopen_cookie cookie;
39};
40
41struct tcp_metrics_block {
42 struct tcp_metrics_block __rcu *tcpm_next;
43 struct inetpeer_addr tcpm_addr;
44 unsigned long tcpm_stamp;
45 u32 tcpm_ts;
46 u32 tcpm_ts_stamp;
47 u32 tcpm_lock;
48 u32 tcpm_vals[TCP_METRIC_MAX];
49 struct tcp_fastopen_metrics tcpm_fastopen;
50};
51
52static bool tcp_metric_locked(struct tcp_metrics_block *tm,
53 enum tcp_metric_index idx)
54{
55 return tm->tcpm_lock & (1 << idx);
56}
57
58static u32 tcp_metric_get(struct tcp_metrics_block *tm,
59 enum tcp_metric_index idx)
60{
61 return tm->tcpm_vals[idx];
62}
63
64static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
65 enum tcp_metric_index idx)
66{
67 return msecs_to_jiffies(tm->tcpm_vals[idx]);
68}
69
70static void tcp_metric_set(struct tcp_metrics_block *tm,
71 enum tcp_metric_index idx,
72 u32 val)
73{
74 tm->tcpm_vals[idx] = val;
75}
76
77static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
78 enum tcp_metric_index idx,
79 u32 val)
80{
81 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
82}
83
84static bool addr_same(const struct inetpeer_addr *a,
85 const struct inetpeer_addr *b)
86{
87 const struct in6_addr *a6, *b6;
88
89 if (a->family != b->family)
90 return false;
91 if (a->family == AF_INET)
92 return a->addr.a4 == b->addr.a4;
93
94 a6 = (const struct in6_addr *) &a->addr.a6[0];
95 b6 = (const struct in6_addr *) &b->addr.a6[0];
96
97 return ipv6_addr_equal(a6, b6);
98}
99
100struct tcpm_hash_bucket {
101 struct tcp_metrics_block __rcu *chain;
102};
103
104static DEFINE_SPINLOCK(tcp_metrics_lock);
105
106static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
107{
108 u32 val;
109
110 tm->tcpm_stamp = jiffies;
111
112 val = 0;
113 if (dst_metric_locked(dst, RTAX_RTT))
114 val |= 1 << TCP_METRIC_RTT;
115 if (dst_metric_locked(dst, RTAX_RTTVAR))
116 val |= 1 << TCP_METRIC_RTTVAR;
117 if (dst_metric_locked(dst, RTAX_SSTHRESH))
118 val |= 1 << TCP_METRIC_SSTHRESH;
119 if (dst_metric_locked(dst, RTAX_CWND))
120 val |= 1 << TCP_METRIC_CWND;
121 if (dst_metric_locked(dst, RTAX_REORDERING))
122 val |= 1 << TCP_METRIC_REORDERING;
123 tm->tcpm_lock = val;
124
125 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
126 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
127 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
128 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
129 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
130 tm->tcpm_ts = 0;
131 tm->tcpm_ts_stamp = 0;
132 tm->tcpm_fastopen.mss = 0;
133 tm->tcpm_fastopen.syn_loss = 0;
134 tm->tcpm_fastopen.cookie.len = 0;
135}
136
137static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
138 struct inetpeer_addr *addr,
139 unsigned int hash,
140 bool reclaim)
141{
142 struct tcp_metrics_block *tm;
143 struct net *net;
144
145 spin_lock_bh(&tcp_metrics_lock);
146 net = dev_net(dst->dev);
147 if (unlikely(reclaim)) {
148 struct tcp_metrics_block *oldest;
149
150 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
151 for (tm = rcu_dereference(oldest->tcpm_next); tm;
152 tm = rcu_dereference(tm->tcpm_next)) {
153 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
154 oldest = tm;
155 }
156 tm = oldest;
157 } else {
158 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
159 if (!tm)
160 goto out_unlock;
161 }
162 tm->tcpm_addr = *addr;
163
164 tcpm_suck_dst(tm, dst);
165
166 if (likely(!reclaim)) {
167 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
168 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
169 }
170
171out_unlock:
172 spin_unlock_bh(&tcp_metrics_lock);
173 return tm;
174}
175
176#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
177
178static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
179{
180 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
181 tcpm_suck_dst(tm, dst);
182}
183
184#define TCP_METRICS_RECLAIM_DEPTH 5
185#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
186
187static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
188{
189 if (tm)
190 return tm;
191 if (depth > TCP_METRICS_RECLAIM_DEPTH)
192 return TCP_METRICS_RECLAIM_PTR;
193 return NULL;
194}
195
196static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
197 struct net *net, unsigned int hash)
198{
199 struct tcp_metrics_block *tm;
200 int depth = 0;
201
202 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
203 tm = rcu_dereference(tm->tcpm_next)) {
204 if (addr_same(&tm->tcpm_addr, addr))
205 break;
206 depth++;
207 }
208 return tcp_get_encode(tm, depth);
209}
210
211static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
212 struct dst_entry *dst)
213{
214 struct tcp_metrics_block *tm;
215 struct inetpeer_addr addr;
216 unsigned int hash;
217 struct net *net;
218
219 addr.family = req->rsk_ops->family;
220 switch (addr.family) {
221 case AF_INET:
222 addr.addr.a4 = inet_rsk(req)->rmt_addr;
223 hash = (__force unsigned int) addr.addr.a4;
224 break;
225 case AF_INET6:
226 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
227 hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
228 break;
229 default:
230 return NULL;
231 }
232
233 net = dev_net(dst->dev);
234 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
235
236 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
237 tm = rcu_dereference(tm->tcpm_next)) {
238 if (addr_same(&tm->tcpm_addr, &addr))
239 break;
240 }
241 tcpm_check_stamp(tm, dst);
242 return tm;
243}
244
245static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
246{
247 struct inet6_timewait_sock *tw6;
248 struct tcp_metrics_block *tm;
249 struct inetpeer_addr addr;
250 unsigned int hash;
251 struct net *net;
252
253 addr.family = tw->tw_family;
254 switch (addr.family) {
255 case AF_INET:
256 addr.addr.a4 = tw->tw_daddr;
257 hash = (__force unsigned int) addr.addr.a4;
258 break;
259 case AF_INET6:
260 tw6 = inet6_twsk((struct sock *)tw);
261 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
262 hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
263 break;
264 default:
265 return NULL;
266 }
267
268 net = twsk_net(tw);
269 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
270
271 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
272 tm = rcu_dereference(tm->tcpm_next)) {
273 if (addr_same(&tm->tcpm_addr, &addr))
274 break;
275 }
276 return tm;
277}
278
279static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
280 struct dst_entry *dst,
281 bool create)
282{
283 struct tcp_metrics_block *tm;
284 struct inetpeer_addr addr;
285 unsigned int hash;
286 struct net *net;
287 bool reclaim;
288
289 addr.family = sk->sk_family;
290 switch (addr.family) {
291 case AF_INET:
292 addr.addr.a4 = inet_sk(sk)->inet_daddr;
293 hash = (__force unsigned int) addr.addr.a4;
294 break;
295 case AF_INET6:
296 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
297 hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
298 break;
299 default:
300 return NULL;
301 }
302
303 net = dev_net(dst->dev);
304 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
305
306 tm = __tcp_get_metrics(&addr, net, hash);
307 reclaim = false;
308 if (tm == TCP_METRICS_RECLAIM_PTR) {
309 reclaim = true;
310 tm = NULL;
311 }
312 if (!tm && create)
313 tm = tcpm_new(dst, &addr, hash, reclaim);
314 else
315 tcpm_check_stamp(tm, dst);
316
317 return tm;
318}
319
320/* Save metrics learned by this TCP session. This function is called
 321 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
322 * or goes from LAST-ACK to CLOSE.
323 */
324void tcp_update_metrics(struct sock *sk)
325{
326 const struct inet_connection_sock *icsk = inet_csk(sk);
327 struct dst_entry *dst = __sk_dst_get(sk);
328 struct tcp_sock *tp = tcp_sk(sk);
329 struct tcp_metrics_block *tm;
330 unsigned long rtt;
331 u32 val;
332 int m;
333
334 if (sysctl_tcp_nometrics_save || !dst)
335 return;
336
337 if (dst->flags & DST_HOST)
338 dst_confirm(dst);
339
340 rcu_read_lock();
341 if (icsk->icsk_backoff || !tp->srtt) {
342 /* This session failed to estimate rtt. Why?
343 * Probably, no packets returned in time. Reset our
344 * results.
345 */
346 tm = tcp_get_metrics(sk, dst, false);
347 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
348 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
349 goto out_unlock;
350 } else
351 tm = tcp_get_metrics(sk, dst, true);
352
353 if (!tm)
354 goto out_unlock;
355
356 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
357 m = rtt - tp->srtt;
358
359 /* If newly calculated rtt larger than stored one, store new
360 * one. Otherwise, use EWMA. Remember, rtt overestimation is
361 * always better than underestimation.
362 */
363 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
364 if (m <= 0)
365 rtt = tp->srtt;
366 else
367 rtt -= (m >> 3);
368 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
369 }
370
371 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
372 unsigned long var;
373
374 if (m < 0)
375 m = -m;
376
377 /* Scale deviation to rttvar fixed point */
378 m >>= 1;
379 if (m < tp->mdev)
380 m = tp->mdev;
381
382 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
383 if (m >= var)
384 var = m;
385 else
386 var -= (var - m) >> 2;
387
388 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
389 }
390
391 if (tcp_in_initial_slowstart(tp)) {
392 /* Slow start still did not finish. */
393 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
394 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
395 if (val && (tp->snd_cwnd >> 1) > val)
396 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
397 tp->snd_cwnd >> 1);
398 }
399 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
400 val = tcp_metric_get(tm, TCP_METRIC_CWND);
401 if (tp->snd_cwnd > val)
402 tcp_metric_set(tm, TCP_METRIC_CWND,
403 tp->snd_cwnd);
404 }
405 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
406 icsk->icsk_ca_state == TCP_CA_Open) {
407 /* Cong. avoidance phase, cwnd is reliable. */
408 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
409 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
410 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
411 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
412 val = tcp_metric_get(tm, TCP_METRIC_CWND);
413 tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
414 }
415 } else {
 416 /* Else slow start did not finish, cwnd is nonsense and
 417 * ssthresh may also be invalid.
418 */
419 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
420 val = tcp_metric_get(tm, TCP_METRIC_CWND);
421 tcp_metric_set(tm, TCP_METRIC_CWND,
422 (val + tp->snd_ssthresh) >> 1);
423 }
424 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
425 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
426 if (val && tp->snd_ssthresh > val)
427 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
428 tp->snd_ssthresh);
429 }
430 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
431 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
432 if (val < tp->reordering &&
433 tp->reordering != sysctl_tcp_reordering)
434 tcp_metric_set(tm, TCP_METRIC_REORDERING,
435 tp->reordering);
436 }
437 }
438 tm->tcpm_stamp = jiffies;
439out_unlock:
440 rcu_read_unlock();
441}
442
443/* Initialize metrics on socket. */
444
445void tcp_init_metrics(struct sock *sk)
446{
447 struct dst_entry *dst = __sk_dst_get(sk);
448 struct tcp_sock *tp = tcp_sk(sk);
449 struct tcp_metrics_block *tm;
450 u32 val;
451
452 if (dst == NULL)
453 goto reset;
454
455 dst_confirm(dst);
456
457 rcu_read_lock();
458 tm = tcp_get_metrics(sk, dst, true);
459 if (!tm) {
460 rcu_read_unlock();
461 goto reset;
462 }
463
464 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
465 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
466
467 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
468 if (val) {
469 tp->snd_ssthresh = val;
470 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
471 tp->snd_ssthresh = tp->snd_cwnd_clamp;
472 } else {
 473 /* ssthresh may have been reduced unnecessarily during the
474 * 3WHS. Restore it back to its initial default.
475 */
476 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
477 }
478 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
479 if (val && tp->reordering != val) {
480 tcp_disable_fack(tp);
481 tcp_disable_early_retrans(tp);
482 tp->reordering = val;
483 }
484
485 val = tcp_metric_get(tm, TCP_METRIC_RTT);
486 if (val == 0 || tp->srtt == 0) {
487 rcu_read_unlock();
488 goto reset;
489 }
490 /* Initial rtt is determined from SYN,SYN-ACK.
491 * The segment is small and rtt may appear much
492 * less than real one. Use per-dst memory
493 * to make it more realistic.
494 *
 495 * A bit of theory. RTT is the time that passes from sending a "normal"
 496 * sized packet until it is ACKed. In normal circumstances sending small
 497 * packets also forces the peer to delay ACKs, so the calculation stays
 498 * correct. The algorithm is adaptive and, provided we follow the specs,
 499 * it NEVER underestimates RTT. BUT! If the peer plays clever tricks,
 500 * sending "quick acks" for long enough to drive RTT down to a low value
 501 * and then abruptly stopping and delaying ACKs instead, expect
 502 * trouble.
503 */
504 val = msecs_to_jiffies(val);
505 if (val > tp->srtt) {
506 tp->srtt = val;
507 tp->rtt_seq = tp->snd_nxt;
508 }
509 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
510 if (val > tp->mdev) {
511 tp->mdev = val;
512 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
513 }
514 rcu_read_unlock();
515
516 tcp_set_rto(sk);
517reset:
518 if (tp->srtt == 0) {
519 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
520 * 3WHS. This is most likely due to retransmission,
521 * including spurious one. Reset the RTO back to 3secs
522 * from the more aggressive 1sec to avoid more spurious
523 * retransmission.
524 */
525 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
526 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
527 }
528 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
529 * retransmitted. In light of RFC6298 more aggressive 1sec
530 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
531 * retransmission has occurred.
532 */
533 if (tp->total_retrans > 1)
534 tp->snd_cwnd = 1;
535 else
536 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
537 tp->snd_cwnd_stamp = tcp_time_stamp;
538}
539
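
The seeding rule above is easier to see outside the kernel's fixed-point arithmetic. A minimal userspace C sketch, with illustrative numbers and the plain RFC 6298 RTO formula standing in for tcp_set_rto():

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned int measured_srtt_ms = 8;   /* from the 3WHS, often too small */
	unsigned int cached_rtt_ms    = 40;  /* per-destination TCP_METRIC_RTT */
	unsigned int cached_rttvar_ms = 20;  /* per-destination TCP_METRIC_RTTVAR */
	unsigned int rto_min_ms       = 200; /* illustrative tcp_rto_min() value */

	/* the cached RTT can only raise the estimate, never lower it */
	unsigned int srtt   = max_u(measured_srtt_ms, cached_rtt_ms);
	unsigned int rttvar = max_u(cached_rttvar_ms, rto_min_ms);
	unsigned int rto    = srtt + 4 * rttvar;          /* RFC 6298 shape */

	printf("seeded srtt=%u ms rttvar=%u ms rto=%u ms\n", srtt, rttvar, rto);
	return 0;
}
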
540bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
541{
542 struct tcp_metrics_block *tm;
543 bool ret;
544
545 if (!dst)
546 return false;
547
548 rcu_read_lock();
549 tm = __tcp_get_metrics_req(req, dst);
550 if (paws_check) {
551 if (tm &&
552 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
553 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
554 ret = false;
555 else
556 ret = true;
557 } else {
558 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
559 ret = true;
560 else
561 ret = false;
562 }
563 rcu_read_unlock();
564
565 return ret;
566}
567EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
568
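
A minimal sketch of the serial-number arithmetic behind the paws_check branch of tcp_peer_is_proven() above; the constants match the kernel's TCP_PAWS_WINDOW/TCP_PAWS_MSL values, while the sample numbers are purely illustrative:

#include <stdio.h>
#include <stdint.h>

#define TCP_PAWS_WINDOW 1            /* replay window, in timestamp ticks */
#define TCP_PAWS_MSL    60           /* seconds a cached stamp stays fresh */

/* true if the request would be rejected by the paws_check branch */
static int paws_reject(uint32_t cached_ts, uint32_t cached_stamp,
		       uint32_t req_ts, uint32_t now)
{
	return now - cached_stamp < TCP_PAWS_MSL &&
	       (int32_t)(cached_ts - req_ts) > TCP_PAWS_WINDOW;
}

int main(void)
{
	uint32_t now = 1000;

	/* cached timestamp newer than the one in the SYN -> suspicious */
	printf("%d\n", paws_reject(500000, now - 10, 499000, now));  /* 1 */
	/* cached stamp too old to matter */
	printf("%d\n", paws_reject(500000, now - 120, 499000, now)); /* 0 */
	return 0;
}
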
569void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
570{
571 struct tcp_metrics_block *tm;
572
573 rcu_read_lock();
574 tm = tcp_get_metrics(sk, dst, true);
575 if (tm) {
576 struct tcp_sock *tp = tcp_sk(sk);
577
578 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
579 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
580 tp->rx_opt.ts_recent = tm->tcpm_ts;
581 }
582 }
583 rcu_read_unlock();
584}
585EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
586
587/* VJ's idea. Save the last timestamp seen from this destination and hold
588 * it for at least the normal timewait interval, to use for duplicate
589 * segment detection in subsequent connections before they enter the
590 * synchronized state.
591 */
592bool tcp_remember_stamp(struct sock *sk)
593{
594 struct dst_entry *dst = __sk_dst_get(sk);
595 bool ret = false;
596
597 if (dst) {
598 struct tcp_metrics_block *tm;
599
600 rcu_read_lock();
601 tm = tcp_get_metrics(sk, dst, true);
602 if (tm) {
603 struct tcp_sock *tp = tcp_sk(sk);
604
605 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
606 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
607 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
608 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
609 tm->tcpm_ts = tp->rx_opt.ts_recent;
610 }
611 ret = true;
612 }
613 rcu_read_unlock();
614 }
615 return ret;
616}
617
618bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
619{
620 struct tcp_metrics_block *tm;
621 bool ret = false;
622
623 rcu_read_lock();
624 tm = __tcp_get_metrics_tw(tw);
625 if (tm) {
626 const struct tcp_timewait_sock *tcptw;
627 struct sock *sk = (struct sock *) tw;
628
629 tcptw = tcp_twsk(sk);
630 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
631 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
632 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
633 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
634 tm->tcpm_ts = tcptw->tw_ts_recent;
635 }
636 ret = true;
637 }
638 rcu_read_unlock();
639
640 return ret;
641}
642
643static DEFINE_SEQLOCK(fastopen_seqlock);
644
645void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
646 struct tcp_fastopen_cookie *cookie,
647 int *syn_loss, unsigned long *last_syn_loss)
648{
649 struct tcp_metrics_block *tm;
650
651 rcu_read_lock();
652 tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
653 if (tm) {
654 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
655 unsigned int seq;
656
657 do {
658 seq = read_seqbegin(&fastopen_seqlock);
659 if (tfom->mss)
660 *mss = tfom->mss;
661 *cookie = tfom->cookie;
662 *syn_loss = tfom->syn_loss;
663 *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
664 } while (read_seqretry(&fastopen_seqlock, seq));
665 }
666 rcu_read_unlock();
667}
668
669void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
670 struct tcp_fastopen_cookie *cookie, bool syn_lost)
671{
672 struct tcp_metrics_block *tm;
673
674 rcu_read_lock();
675 tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
676 if (tm) {
677 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
678
679 write_seqlock_bh(&fastopen_seqlock);
680 tfom->mss = mss;
681 if (cookie->len > 0)
682 tfom->cookie = *cookie;
683 if (syn_lost) {
684 ++tfom->syn_loss;
685 tfom->last_syn_loss = jiffies;
686 } else
687 tfom->syn_loss = 0;
688 write_sequnlock_bh(&fastopen_seqlock);
689 }
690 rcu_read_unlock();
691}
692
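
The fastopen_seqlock usage above follows the usual seqlock discipline: the writer bumps the sequence to odd, updates the fields, then bumps it back to even; readers retry until they observe an unchanged even value. A hedged userspace sketch of that pattern, using C11 atomics with simplified memory ordering and illustrative field sizes:

#include <stdatomic.h>
#include <stdio.h>

struct fastopen_cache {
	atomic_uint seq;             /* even = stable, odd = write in progress */
	unsigned short mss;
	unsigned char cookie[8];
};

static struct fastopen_cache cache;

static void cache_write(unsigned short mss, const unsigned char *cookie)
{
	atomic_fetch_add_explicit(&cache.seq, 1, memory_order_release); /* -> odd */
	cache.mss = mss;
	for (int i = 0; i < 8; i++)
		cache.cookie[i] = cookie[i];
	atomic_fetch_add_explicit(&cache.seq, 1, memory_order_release); /* -> even */
}

static unsigned short cache_read(unsigned char *cookie)
{
	unsigned int begin;
	unsigned short mss;

	do {
		do {	/* wait for an even (stable) sequence, then snapshot */
			begin = atomic_load_explicit(&cache.seq, memory_order_acquire);
		} while (begin & 1);
		mss = cache.mss;
		for (int i = 0; i < 8; i++)
			cookie[i] = cache.cookie[i];
	} while (atomic_load_explicit(&cache.seq, memory_order_acquire) != begin);

	return mss;
}

int main(void)
{
	unsigned char c[8] = { 0xde, 0xad, 0xbe, 0xef, 1, 2, 3, 4 }, out[8];

	cache_write(1460, c);
	printf("mss=%u cookie[0]=0x%02x\n", cache_read(out), out[0]);
	return 0;
}
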
693static unsigned int tcpmhash_entries;
694static int __init set_tcpmhash_entries(char *str)
695{
696 ssize_t ret;
697
698 if (!str)
699 return 0;
700
701 ret = kstrtouint(str, 0, &tcpmhash_entries);
702 if (ret)
703 return 0;
704
705 return 1;
706}
707__setup("tcpmhash_entries=", set_tcpmhash_entries);
708
709static int __net_init tcp_net_metrics_init(struct net *net)
710{
711 size_t size;
712 unsigned int slots;
713
714 slots = tcpmhash_entries;
715 if (!slots) {
716 if (totalram_pages >= 128 * 1024)
717 slots = 16 * 1024;
718 else
719 slots = 8 * 1024;
720 }
721
722 net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
723 size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
724
725 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
726 if (!net->ipv4.tcp_metrics_hash)
727 return -ENOMEM;
728
729 return 0;
730}
731
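
A small userspace sketch of the sizing math in tcp_net_metrics_init(): round the slot count up to a power of two and shift the bucket size by that order. order_base_2() is reimplemented here for userspace, and the bucket size is a placeholder rather than the real sizeof(struct tcpm_hash_bucket):

#include <stdio.h>

static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int slots = 16 * 1024;       /* >= 512 MiB of RAM, 4 KiB pages */
	unsigned int log = order_base_2(slots);
	size_t bucket_size = sizeof(void *);  /* placeholder bucket size */

	printf("hash_log=%u buckets=%u table=%zu bytes\n",
	       log, 1u << log, bucket_size << log);
	return 0;
}
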
732static void __net_exit tcp_net_metrics_exit(struct net *net)
733{
734 kfree(net->ipv4.tcp_metrics_hash);
735}
736
737static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
738 .init = tcp_net_metrics_init,
739 .exit = tcp_net_metrics_exit,
740};
741
742void __init tcp_metrics_init(void)
743{
744 register_pernet_subsys(&tcp_net_metrics_ops);
745}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b85d9fe7d663..5912ac3fd240 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,56 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static bool tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return true;
76 }
77
78 return false;
79}
80
81static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return true;
98 }
99 return false;
100}
101
102static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
103{ 53{
104 if (seq == s_win) 54 if (seq == s_win)
@@ -147,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
147 97
148 tmp_opt.saw_tstamp = 0; 98 tmp_opt.saw_tstamp = 0;
149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 99 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
150 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 100 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
151 101
152 if (tmp_opt.saw_tstamp) { 102 if (tmp_opt.saw_tstamp) {
153 tmp_opt.ts_recent = tcptw->tw_ts_recent; 103 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -327,8 +277,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
327 if (tw != NULL) { 277 if (tw != NULL) {
328 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 278 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
329 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 279 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
280 struct inet_sock *inet = inet_sk(sk);
330 281
331 tw->tw_transparent = inet_sk(sk)->transparent; 282 tw->tw_transparent = inet->transparent;
332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 283 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
333 tcptw->tw_rcv_nxt = tp->rcv_nxt; 284 tcptw->tw_rcv_nxt = tp->rcv_nxt;
334 tcptw->tw_snd_nxt = tp->snd_nxt; 285 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -403,6 +354,7 @@ void tcp_twsk_destructor(struct sock *sk)
403{ 354{
404#ifdef CONFIG_TCP_MD5SIG 355#ifdef CONFIG_TCP_MD5SIG
405 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 356 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
357
406 if (twsk->tw_md5_key) { 358 if (twsk->tw_md5_key) {
407 tcp_free_md5sig_pool(); 359 tcp_free_md5sig_pool();
408 kfree_rcu(twsk->tw_md5_key, rcu); 360 kfree_rcu(twsk->tw_md5_key, rcu);
@@ -435,6 +387,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
435 struct tcp_sock *oldtp = tcp_sk(sk); 387 struct tcp_sock *oldtp = tcp_sk(sk);
436 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
437 389
390 newsk->sk_rx_dst = dst_clone(skb_dst(skb));
391
438 /* TCP Cookie Transactions require space for the cookie pair, 392 /* TCP Cookie Transactions require space for the cookie pair,
439 * as it differs for each connection. There is no need to 393 * as it differs for each connection. There is no need to
440 * copy any s_data_payload stored at the original socket. 394 * copy any s_data_payload stored at the original socket.
@@ -470,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
470 treq->snt_isn + 1 + tcp_s_data_size(oldtp); 424 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
471 425
472 tcp_prequeue_init(newtp); 426 tcp_prequeue_init(newtp);
427 INIT_LIST_HEAD(&newtp->tsq_node);
473 428
474 tcp_init_wl(newtp, treq->rcv_isn); 429 tcp_init_wl(newtp, treq->rcv_isn);
475 430
@@ -579,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
579 534
580 tmp_opt.saw_tstamp = 0; 535 tmp_opt.saw_tstamp = 0;
581 if (th->doff > (sizeof(struct tcphdr)>>2)) { 536 if (th->doff > (sizeof(struct tcphdr)>>2)) {
582 tcp_parse_options(skb, &tmp_opt, &hash_location, 0); 537 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
583 538
584 if (tmp_opt.saw_tstamp) { 539 if (tmp_opt.saw_tstamp) {
585 tmp_opt.ts_recent = req->ts_recent; 540 tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 803cbfe82fbc..33cd065cfbd8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
50 */ 50 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 51int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52 52
53/* Default TSQ limit of two TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55
53/* This limits the percentage of the congestion window which we 56/* This limits the percentage of the congestion window which we
54 * will allow a single TSO frame to consume. Building TSO frames 57 * will allow a single TSO frame to consume. Building TSO frames
55 * which are too large can cause TCP streams to be bursty. 58 * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
65int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
66EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
67 70
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp);
68 73
69/* Account for new data that has been sent to the network. */ 74/* Account for new data that has been sent to the network. */
70static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -380,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
380#define OPTION_MD5 (1 << 2) 385#define OPTION_MD5 (1 << 2)
381#define OPTION_WSCALE (1 << 3) 386#define OPTION_WSCALE (1 << 3)
382#define OPTION_COOKIE_EXTENSION (1 << 4) 387#define OPTION_COOKIE_EXTENSION (1 << 4)
388#define OPTION_FAST_OPEN_COOKIE (1 << 8)
383 389
384struct tcp_out_options { 390struct tcp_out_options {
385 u8 options; /* bit field of OPTION_* */ 391 u16 options; /* bit field of OPTION_* */
392 u16 mss; /* 0 to disable */
386 u8 ws; /* window scale, 0 to disable */ 393 u8 ws; /* window scale, 0 to disable */
387 u8 num_sack_blocks; /* number of SACK blocks to include */ 394 u8 num_sack_blocks; /* number of SACK blocks to include */
388 u8 hash_size; /* bytes in hash_location */ 395 u8 hash_size; /* bytes in hash_location */
389 u16 mss; /* 0 to disable */
390 __u32 tsval, tsecr; /* need to include OPTION_TS */
391 __u8 *hash_location; /* temporary pointer, overloaded */ 396 __u8 *hash_location; /* temporary pointer, overloaded */
397 __u32 tsval, tsecr; /* need to include OPTION_TS */
398 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
392}; 399};
393 400
394/* The sysctl int routines are generic, so check consistency here. 401/* The sysctl int routines are generic, so check consistency here.
@@ -437,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
437static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 444static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
438 struct tcp_out_options *opts) 445 struct tcp_out_options *opts)
439{ 446{
440 u8 options = opts->options; /* mungable copy */ 447 u16 options = opts->options; /* mungable copy */
441 448
442 /* Having both authentication and cookies for security is redundant, 449 /* Having both authentication and cookies for security is redundant,
443 * and there's certainly not enough room. Instead, the cookie-less 450 * and there's certainly not enough room. Instead, the cookie-less
@@ -559,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
559 566
560 tp->rx_opt.dsack = 0; 567 tp->rx_opt.dsack = 0;
561 } 568 }
569
570 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
571 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
572
573 *ptr++ = htonl((TCPOPT_EXP << 24) |
574 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
575 TCPOPT_FASTOPEN_MAGIC);
576
577 memcpy(ptr, foc->val, foc->len);
578 if ((foc->len & 3) == 2) {
579 u8 *align = ((u8 *)ptr) + foc->len;
580 align[0] = align[1] = TCPOPT_NOP;
581 }
582 ptr += (foc->len + 3) >> 2;
583 }
562} 584}
563 585
564/* Compute TCP options for SYN packets. This is not the final 586/* Compute TCP options for SYN packets. This is not the final
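
The Fast Open option emitted above can be checked by hand. A hedged sketch of the wire layout, assuming the usual experimental-option constants (kind 254, base length 4, magic 0xF989) and an illustrative 6-byte cookie, which is exactly the (len & 3) == 2 case that needs NOP padding:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP                1
#define TCPOPT_EXP                254     /* experimental option kind */
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOPT_FASTOPEN_MAGIC     0xF989

int main(void)
{
	unsigned char cookie[6] = { 1, 2, 3, 4, 5, 6 };
	unsigned int len = sizeof(cookie);
	unsigned char opt[16];
	uint32_t word = htonl(((uint32_t)TCPOPT_EXP << 24) |
			      ((TCPOLEN_EXP_FASTOPEN_BASE + len) << 16) |
			      TCPOPT_FASTOPEN_MAGIC);
	unsigned int total;

	memcpy(opt, &word, 4);          /* kind, length, 16-bit magic */
	memcpy(opt + 4, cookie, len);   /* the cookie itself */
	total = 4 + len;
	while (total & 3)               /* NOP-pad to a 32-bit boundary */
		opt[total++] = TCPOPT_NOP;

	for (unsigned int i = 0; i < total; i++)
		printf("%02x ", opt[i]);
	printf("\n");                   /* fe 0a f9 89 01..06 01 01 */
	return 0;
}
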
@@ -574,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
574 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 596 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
575 tcp_cookie_size_check(cvp->cookie_desired) : 597 tcp_cookie_size_check(cvp->cookie_desired) :
576 0; 598 0;
599 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
577 600
578#ifdef CONFIG_TCP_MD5SIG 601#ifdef CONFIG_TCP_MD5SIG
579 *md5 = tp->af_specific->md5_lookup(sk, sk); 602 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -614,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
614 remaining -= TCPOLEN_SACKPERM_ALIGNED; 637 remaining -= TCPOLEN_SACKPERM_ALIGNED;
615 } 638 }
616 639
640 if (fastopen && fastopen->cookie.len >= 0) {
641 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
642 need = (need + 3) & ~3U; /* Align to 32 bits */
643 if (remaining >= need) {
644 opts->options |= OPTION_FAST_OPEN_COOKIE;
645 opts->fastopen_cookie = &fastopen->cookie;
646 remaining -= need;
647 tp->syn_fastopen = 1;
648 }
649 }
617 /* Note that timestamps are required by the specification. 650 /* Note that timestamps are required by the specification.
618 * 651 *
619 * Odd numbers of bytes are prohibited by the specification, ensuring 652 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -783,6 +816,156 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
783 return size; 816 return size;
784} 817}
785 818
819
820/* TCP SMALL QUEUES (TSQ)
821 *
 822 * The TSQ goal is to keep a small number of skbs per TCP flow in the tx queues (qdisc+dev)
823 * to reduce RTT and bufferbloat.
824 * We do this using a special skb destructor (tcp_wfree).
825 *
 826 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb
 827 * needs to be reallocated in a driver.
 828 * The invariant is that skb->truesize has been subtracted from sk->sk_wmem_alloc.
829 *
830 * Since transmit from skb destructor is forbidden, we use a tasklet
831 * to process all sockets that eventually need to send more skbs.
832 * We use one tasklet per cpu, with its own queue of sockets.
833 */
834struct tsq_tasklet {
835 struct tasklet_struct tasklet;
836 struct list_head head; /* queue of tcp sockets */
837};
838static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
839
840static void tcp_tsq_handler(struct sock *sk)
841{
842 if ((1 << sk->sk_state) &
843 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
844 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
845 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
846}
847/*
 848 * One tasklet per cpu tries to send more skbs.
 849 * We run in tasklet context but need to disable irqs when
 850 * transferring tsq->head because tcp_wfree() might
 851 * interrupt us (non-NAPI drivers).
852 */
853static void tcp_tasklet_func(unsigned long data)
854{
855 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
856 LIST_HEAD(list);
857 unsigned long flags;
858 struct list_head *q, *n;
859 struct tcp_sock *tp;
860 struct sock *sk;
861
862 local_irq_save(flags);
863 list_splice_init(&tsq->head, &list);
864 local_irq_restore(flags);
865
866 list_for_each_safe(q, n, &list) {
867 tp = list_entry(q, struct tcp_sock, tsq_node);
868 list_del(&tp->tsq_node);
869
870 sk = (struct sock *)tp;
871 bh_lock_sock(sk);
872
873 if (!sock_owned_by_user(sk)) {
874 tcp_tsq_handler(sk);
875 } else {
876 /* defer the work to tcp_release_cb() */
877 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
878 }
879 bh_unlock_sock(sk);
880
881 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
882 sk_free(sk);
883 }
884}
885
886#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
887 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
888 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
889 (1UL << TCP_MTU_REDUCED_DEFERRED))
890/**
891 * tcp_release_cb - tcp release_sock() callback
892 * @sk: socket
893 *
894 * called from release_sock() to perform protocol dependent
895 * actions before socket release.
896 */
897void tcp_release_cb(struct sock *sk)
898{
899 struct tcp_sock *tp = tcp_sk(sk);
900 unsigned long flags, nflags;
901
902 /* perform an atomic operation only if at least one flag is set */
903 do {
904 flags = tp->tsq_flags;
905 if (!(flags & TCP_DEFERRED_ALL))
906 return;
907 nflags = flags & ~TCP_DEFERRED_ALL;
908 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
909
910 if (flags & (1UL << TCP_TSQ_DEFERRED))
911 tcp_tsq_handler(sk);
912
913 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED))
914 tcp_write_timer_handler(sk);
915
916 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
917 tcp_delack_timer_handler(sk);
918
919 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
920 sk->sk_prot->mtu_reduced(sk);
921}
922EXPORT_SYMBOL(tcp_release_cb);
923
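
The cmpxchg() loop in tcp_release_cb() is the classic claim-and-clear pattern: snapshot the flag word and only act on the bits you actually managed to swap out. A hedged userspace sketch with C11 atomics and made-up work bits:

#include <stdatomic.h>
#include <stdio.h>

#define WORK_A   (1ul << 0)           /* made-up deferred-work bits */
#define WORK_B   (1ul << 1)
#define WORK_ALL (WORK_A | WORK_B)

static atomic_ulong deferred;

/* atomically take ownership of any pending work bits, clearing them */
static unsigned long claim_deferred(void)
{
	unsigned long flags = atomic_load(&deferred);

	while (flags & WORK_ALL) {
		if (atomic_compare_exchange_weak(&deferred, &flags,
						 flags & ~WORK_ALL))
			return flags & WORK_ALL;
		/* on failure, 'flags' was reloaded; retry */
	}
	return 0;
}

int main(void)
{
	atomic_fetch_or(&deferred, WORK_A);
	printf("claimed 0x%lx\n", claim_deferred());  /* 0x1 */
	printf("claimed 0x%lx\n", claim_deferred());  /* 0x0, nothing left */
	return 0;
}
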
924void __init tcp_tasklet_init(void)
925{
926 int i;
927
928 for_each_possible_cpu(i) {
929 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
930
931 INIT_LIST_HEAD(&tsq->head);
932 tasklet_init(&tsq->tasklet,
933 tcp_tasklet_func,
934 (unsigned long)tsq);
935 }
936}
937
938/*
939 * Write buffer destructor automatically called from kfree_skb.
 940 * We can't xmit new skbs from this context, as we might already
 941 * hold the qdisc lock.
942 */
943void tcp_wfree(struct sk_buff *skb)
944{
945 struct sock *sk = skb->sk;
946 struct tcp_sock *tp = tcp_sk(sk);
947
948 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
949 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
950 unsigned long flags;
951 struct tsq_tasklet *tsq;
952
953 /* Keep a ref on socket.
954 * This last ref will be released in tcp_tasklet_func()
955 */
956 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
957
958 /* queue this socket to tasklet queue */
959 local_irq_save(flags);
960 tsq = &__get_cpu_var(tsq_tasklet);
961 list_add(&tp->tsq_node, &tsq->head);
962 tasklet_schedule(&tsq->tasklet);
963 local_irq_restore(flags);
964 } else {
965 sock_wfree(skb);
966 }
967}
968
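
The per-socket limit used by the TSQ throttling check in tcp_write_xmit() is the new sysctl_tcp_limit_output_bytes. A hedged userspace snippet that reads it back, assuming the procfs path net/ipv4/tcp_limit_output_bytes exposed by the accompanying sysctl change:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_limit_output_bytes", "r");
	long limit;

	if (!f) {
		perror("tcp_limit_output_bytes");
		return 1;
	}
	if (fscanf(f, "%ld", &limit) == 1)
		printf("TSQ limit: %ld bytes queued per socket in qdisc/device\n",
		       limit);
	fclose(f);
	return 0;
}
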
786/* This routine actually transmits TCP packets queued in by 969/* This routine actually transmits TCP packets queued in by
787 * tcp_do_sendmsg(). This is used by both the initial 970 * tcp_do_sendmsg(). This is used by both the initial
788 * transmission and possible later retransmissions. 971 * transmission and possible later retransmissions.
@@ -844,7 +1027,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
844 1027
845 skb_push(skb, tcp_header_size); 1028 skb_push(skb, tcp_header_size);
846 skb_reset_transport_header(skb); 1029 skb_reset_transport_header(skb);
847 skb_set_owner_w(skb, sk); 1030
1031 skb_orphan(skb);
1032 skb->sk = sk;
1033 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
1034 tcp_wfree : sock_wfree;
1035 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
848 1036
849 /* Build TCP header and checksum it. */ 1037 /* Build TCP header and checksum it. */
850 th = tcp_hdr(skb); 1038 th = tcp_hdr(skb);
@@ -1780,6 +1968,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1780 while ((skb = tcp_send_head(sk))) { 1968 while ((skb = tcp_send_head(sk))) {
1781 unsigned int limit; 1969 unsigned int limit;
1782 1970
1971
1783 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1972 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1784 BUG_ON(!tso_segs); 1973 BUG_ON(!tso_segs);
1785 1974
@@ -1800,6 +1989,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1800 break; 1989 break;
1801 } 1990 }
1802 1991
 1992		/* TSQ : sk_wmem_alloc accounts for skb truesize,
 1993		 * including skb overhead. But that's OK.
1994 */
1995 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1996 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1997 break;
1998 }
1803 limit = mss_now; 1999 limit = mss_now;
1804 if (tso_segs > 1 && !tcp_urg_mode(tp)) 2000 if (tso_segs > 1 && !tcp_urg_mode(tp))
1805 limit = tcp_mss_split_point(sk, skb, mss_now, 2001 limit = tcp_mss_split_point(sk, skb, mss_now,
@@ -2442,7 +2638,16 @@ int tcp_send_synack(struct sock *sk)
2442 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2638 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2443} 2639}
2444 2640
2445/* Prepare a SYN-ACK. */ 2641/**
2642 * tcp_make_synack - Prepare a SYN-ACK.
 2643 * @sk: listener socket
 2644 * @dst: dst entry attached to the SYNACK
 2645 * @req: request_sock pointer
 2646 * @rvp: request_values pointer
2647 *
2648 * Allocate one skb and build a SYNACK packet.
 2649 * @dst is consumed: the caller should not use it again.
2650 */
2446struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2651struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2447 struct request_sock *req, 2652 struct request_sock *req,
2448 struct request_values *rvp) 2653 struct request_values *rvp)
@@ -2461,14 +2666,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2461 2666
2462 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2667 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2463 s_data_desired = cvp->s_data_desired; 2668 s_data_desired = cvp->s_data_desired;
2464 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); 2669 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
2465 if (skb == NULL) 2670 if (unlikely(!skb)) {
2671 dst_release(dst);
2466 return NULL; 2672 return NULL;
2467 2673 }
2468 /* Reserve space for headers. */ 2674 /* Reserve space for headers. */
2469 skb_reserve(skb, MAX_TCP_HEADER); 2675 skb_reserve(skb, MAX_TCP_HEADER);
2470 2676
2471 skb_dst_set(skb, dst_clone(dst)); 2677 skb_dst_set(skb, dst);
2472 2678
2473 mss = dst_metric_advmss(dst); 2679 mss = dst_metric_advmss(dst);
2474 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2680 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2645,6 +2851,109 @@ void tcp_connect_init(struct sock *sk)
2645 tcp_clear_retrans(tp); 2851 tcp_clear_retrans(tp);
2646} 2852}
2647 2853
2854static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2855{
2856 struct tcp_sock *tp = tcp_sk(sk);
2857 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2858
2859 tcb->end_seq += skb->len;
2860 skb_header_release(skb);
2861 __tcp_add_write_queue_tail(sk, skb);
2862 sk->sk_wmem_queued += skb->truesize;
2863 sk_mem_charge(sk, skb->truesize);
2864 tp->write_seq = tcb->end_seq;
2865 tp->packets_out += tcp_skb_pcount(skb);
2866}
2867
2868/* Build and send a SYN with data and (cached) Fast Open cookie. However,
2869 * queue a data-only packet after the regular SYN, such that regular SYNs
2870 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
2871 * only the SYN sequence, the data are retransmitted in the first ACK.
 2872 * If the cookie is not cached or another error occurs, fall back to sending a
 2873 * regular SYN with a Fast Open cookie request option.
2874 */
2875static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2876{
2877 struct tcp_sock *tp = tcp_sk(sk);
2878 struct tcp_fastopen_request *fo = tp->fastopen_req;
2879 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
2880 struct sk_buff *syn_data = NULL, *data;
2881 unsigned long last_syn_loss = 0;
2882
2883 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
2884 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
2885 &syn_loss, &last_syn_loss);
2886 /* Recurring FO SYN losses: revert to regular handshake temporarily */
2887 if (syn_loss > 1 &&
2888 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
2889 fo->cookie.len = -1;
2890 goto fallback;
2891 }
2892
2893 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
2894 fo->cookie.len = -1;
2895 else if (fo->cookie.len <= 0)
2896 goto fallback;
2897
2898 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
2899 * user-MSS. Reserve maximum option space for middleboxes that add
2900 * private TCP options. The cost is reduced data space in SYN :(
2901 */
2902 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
2903 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2904 space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2905 MAX_TCP_OPTION_SPACE;
2906
2907 syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
2908 sk->sk_allocation);
2909 if (syn_data == NULL)
2910 goto fallback;
2911
2912 for (i = 0; i < iovlen && syn_data->len < space; ++i) {
2913 struct iovec *iov = &fo->data->msg_iov[i];
2914 unsigned char __user *from = iov->iov_base;
2915 int len = iov->iov_len;
2916
2917 if (syn_data->len + len > space)
2918 len = space - syn_data->len;
2919 else if (i + 1 == iovlen)
2920 /* No more data pending in inet_wait_for_connect() */
2921 fo->data = NULL;
2922
2923 if (skb_add_data(syn_data, from, len))
2924 goto fallback;
2925 }
2926
2927 /* Queue a data-only packet after the regular SYN for retransmission */
2928 data = pskb_copy(syn_data, sk->sk_allocation);
2929 if (data == NULL)
2930 goto fallback;
2931 TCP_SKB_CB(data)->seq++;
2932 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
2933 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
2934 tcp_connect_queue_skb(sk, data);
2935 fo->copied = data->len;
2936
2937 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2938 tp->syn_data = (fo->copied > 0);
2939 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
2940 goto done;
2941 }
2942 syn_data = NULL;
2943
2944fallback:
2945 /* Send a regular SYN with Fast Open cookie request option */
2946 if (fo->cookie.len > 0)
2947 fo->cookie.len = 0;
2948 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
2949 if (err)
2950 tp->syn_fastopen = 0;
2951 kfree_skb(syn_data);
2952done:
2953 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
2954 return err;
2955}
2956
2648/* Build a SYN and send it off. */ 2957/* Build a SYN and send it off. */
2649int tcp_connect(struct sock *sk) 2958int tcp_connect(struct sock *sk)
2650{ 2959{
@@ -2662,17 +2971,13 @@ int tcp_connect(struct sock *sk)
2662 skb_reserve(buff, MAX_TCP_HEADER); 2971 skb_reserve(buff, MAX_TCP_HEADER);
2663 2972
2664 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2973 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2974 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
2975 tcp_connect_queue_skb(sk, buff);
2665 TCP_ECN_send_syn(sk, buff); 2976 TCP_ECN_send_syn(sk, buff);
2666 2977
2667 /* Send it off. */ 2978 /* Send off SYN; include data in Fast Open. */
2668 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2979 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
2669 tp->retrans_stamp = TCP_SKB_CB(buff)->when; 2980 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2670 skb_header_release(buff);
2671 __tcp_add_write_queue_tail(sk, buff);
2672 sk->sk_wmem_queued += buff->truesize;
2673 sk_mem_charge(sk, buff->truesize);
2674 tp->packets_out += tcp_skb_pcount(buff);
2675 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2676 if (err == -ECONNREFUSED) 2981 if (err == -ECONNREFUSED)
2677 return err; 2982 return err;
2678 2983
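
From userspace, the client-side Fast Open path added above is reached by passing the first chunk of data to sendto()/sendmsg() with MSG_FASTOPEN on an unconnected TCP socket. A hedged example; the address, port, and payload are illustrative, and the kernel must have client TFO enabled via net.ipv4.tcp_fastopen:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(80),
	};
	int fd;

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);   /* illustrative peer */
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	/* The SYN carries the request when a cookie is cached; otherwise the
	 * kernel falls back to a plain SYN with a cookie request option. */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}
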
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e911e6c523ec..6df36ad55a38 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,17 +32,6 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
32int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly; 33int sysctl_tcp_thin_linear_timeouts __read_mostly;
34 34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39void tcp_init_xmit_timers(struct sock *sk)
40{
41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
42 &tcp_keepalive_timer);
43}
44EXPORT_SYMBOL(tcp_init_xmit_timers);
45
46static void tcp_write_err(struct sock *sk) 35static void tcp_write_err(struct sock *sk)
47{ 36{
48 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; 37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -205,21 +194,11 @@ static int tcp_write_timeout(struct sock *sk)
205 return 0; 194 return 0;
206} 195}
207 196
208static void tcp_delack_timer(unsigned long data) 197void tcp_delack_timer_handler(struct sock *sk)
209{ 198{
210 struct sock *sk = (struct sock *)data;
211 struct tcp_sock *tp = tcp_sk(sk); 199 struct tcp_sock *tp = tcp_sk(sk);
212 struct inet_connection_sock *icsk = inet_csk(sk); 200 struct inet_connection_sock *icsk = inet_csk(sk);
213 201
214 bh_lock_sock(sk);
215 if (sock_owned_by_user(sk)) {
216 /* Try again later. */
217 icsk->icsk_ack.blocked = 1;
218 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
219 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
220 goto out_unlock;
221 }
222
223 sk_mem_reclaim_partial(sk); 202 sk_mem_reclaim_partial(sk);
224 203
225 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -260,7 +239,21 @@ static void tcp_delack_timer(unsigned long data)
260out: 239out:
261 if (sk_under_memory_pressure(sk)) 240 if (sk_under_memory_pressure(sk))
262 sk_mem_reclaim(sk); 241 sk_mem_reclaim(sk);
263out_unlock: 242}
243
244static void tcp_delack_timer(unsigned long data)
245{
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 254 /* delegate our work to tcp_release_cb() */
255 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
256 }
264 bh_unlock_sock(sk); 257 bh_unlock_sock(sk);
265 sock_put(sk); 258 sock_put(sk);
266} 259}
@@ -450,19 +443,11 @@ out_reset_timer:
450out:; 443out:;
451} 444}
452 445
453static void tcp_write_timer(unsigned long data) 446void tcp_write_timer_handler(struct sock *sk)
454{ 447{
455 struct sock *sk = (struct sock *)data;
456 struct inet_connection_sock *icsk = inet_csk(sk); 448 struct inet_connection_sock *icsk = inet_csk(sk);
457 int event; 449 int event;
458 450
459 bh_lock_sock(sk);
460 if (sock_owned_by_user(sk)) {
461 /* Try again later */
462 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
463 goto out_unlock;
464 }
465
466 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 451 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
467 goto out; 452 goto out;
468 453
@@ -485,7 +470,19 @@ static void tcp_write_timer(unsigned long data)
485 470
486out: 471out:
487 sk_mem_reclaim(sk); 472 sk_mem_reclaim(sk);
488out_unlock: 473}
474
475static void tcp_write_timer(unsigned long data)
476{
477 struct sock *sk = (struct sock *)data;
478
479 bh_lock_sock(sk);
480 if (!sock_owned_by_user(sk)) {
481 tcp_write_timer_handler(sk);
482 } else {
 483 /* delegate our work to tcp_release_cb() */
484 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags);
485 }
489 bh_unlock_sock(sk); 486 bh_unlock_sock(sk);
490 sock_put(sk); 487 sock_put(sk);
491} 488}
@@ -602,3 +599,10 @@ out:
602 bh_unlock_sock(sk); 599 bh_unlock_sock(sk);
603 sock_put(sk); 600 sock_put(sk);
604} 601}
602
603void tcp_init_xmit_timers(struct sock *sk)
604{
605 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
606 &tcp_keepalive_timer);
607}
608EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eaca73644e79..b4c3582a991f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,6 +108,7 @@
108#include <net/xfrm.h> 108#include <net/xfrm.h>
109#include <trace/events/udp.h> 109#include <trace/events/udp.h>
110#include <linux/static_key.h> 110#include <linux/static_key.h>
111#include <trace/events/skb.h>
111#include "udp_impl.h" 112#include "udp_impl.h"
112 113
113struct udp_table udp_table __read_mostly; 114struct udp_table udp_table __read_mostly;
@@ -615,6 +616,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
615 break; 616 break;
616 case ICMP_DEST_UNREACH: 617 case ICMP_DEST_UNREACH:
617 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 618 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
619 ipv4_sk_update_pmtu(skb, sk, info);
618 if (inet->pmtudisc != IP_PMTUDISC_DONT) { 620 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
619 err = EMSGSIZE; 621 err = EMSGSIZE;
620 harderr = 1; 622 harderr = 1;
@@ -628,6 +630,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
628 err = icmp_err_convert[code].errno; 630 err = icmp_err_convert[code].errno;
629 } 631 }
630 break; 632 break;
633 case ICMP_REDIRECT:
634 ipv4_sk_redirect(skb, sk);
635 break;
631 } 636 }
632 637
633 /* 638 /*
@@ -1219,8 +1224,10 @@ try_again:
1219 goto csum_copy_err; 1224 goto csum_copy_err;
1220 } 1225 }
1221 1226
1222 if (err) 1227 if (unlikely(err)) {
1228 trace_kfree_skb(skb, udp_recvmsg);
1223 goto out_free; 1229 goto out_free;
1230 }
1224 1231
1225 if (!peeked) 1232 if (!peeked)
1226 UDP_INC_STATS_USER(sock_net(sk), 1233 UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index a7f86a3cd502..16d0960062be 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
34 int err = -EINVAL; 34 int err = -EINVAL;
35 struct sock *sk; 35 struct sock *sk;
36 struct sk_buff *rep; 36 struct sk_buff *rep;
37 struct net *net = sock_net(in_skb->sk);
37 38
38 if (req->sdiag_family == AF_INET) 39 if (req->sdiag_family == AF_INET)
39 sk = __udp4_lib_lookup(&init_net, 40 sk = __udp4_lib_lookup(net,
40 req->id.idiag_src[0], req->id.idiag_sport, 41 req->id.idiag_src[0], req->id.idiag_sport,
41 req->id.idiag_dst[0], req->id.idiag_dport, 42 req->id.idiag_dst[0], req->id.idiag_dport,
42 req->id.idiag_if, tbl); 43 req->id.idiag_if, tbl);
43#if IS_ENABLED(CONFIG_IPV6) 44#if IS_ENABLED(CONFIG_IPV6)
44 else if (req->sdiag_family == AF_INET6) 45 else if (req->sdiag_family == AF_INET6)
45 sk = __udp6_lib_lookup(&init_net, 46 sk = __udp6_lib_lookup(net,
46 (struct in6_addr *)req->id.idiag_src, 47 (struct in6_addr *)req->id.idiag_src,
47 req->id.idiag_sport, 48 req->id.idiag_sport,
48 (struct in6_addr *)req->id.idiag_dst, 49 (struct in6_addr *)req->id.idiag_dst,
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
75 kfree_skb(rep); 76 kfree_skb(rep);
76 goto out; 77 goto out;
77 } 78 }
78 err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, 79 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
79 MSG_DONTWAIT); 80 MSG_DONTWAIT);
80 if (err > 0) 81 if (err > 0)
81 err = 0; 82 err = 0;
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
90 struct inet_diag_req_v2 *r, struct nlattr *bc) 91 struct inet_diag_req_v2 *r, struct nlattr *bc)
91{ 92{
92 int num, s_num, slot, s_slot; 93 int num, s_num, slot, s_slot;
94 struct net *net = sock_net(skb->sk);
93 95
94 s_slot = cb->args[0]; 96 s_slot = cb->args[0];
95 num = s_num = cb->args[1]; 97 num = s_num = cb->args[1];
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
106 sk_nulls_for_each(sk, node, &hslot->head) { 108 sk_nulls_for_each(sk, node, &hslot->head) {
107 struct inet_sock *inet = inet_sk(sk); 109 struct inet_sock *inet = inet_sk(sk);
108 110
111 if (!net_eq(sock_net(sk), net))
112 continue;
109 if (num < s_num) 113 if (num < s_num)
110 goto next; 114 goto next;
111 if (!(r->idiag_states & (1 << sk->sk_state))) 115 if (!(r->idiag_states & (1 << sk->sk_state)))
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..58d23a572509 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
27 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 30 if (ip_route_input(skb, iph->daddr, iph->saddr,
31 iph->tos, skb->dev)) 31 iph->tos, skb->dev))
32 goto drop; 32 goto drop;
33 } 33 }
34 return dst_input(skb); 34 return dst_input(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ed4bf11ef9f4..ddee0a099a2c 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,6 +15,65 @@
15#include <net/ip.h> 15#include <net/ip.h>
16#include <net/xfrm.h> 16#include <net/xfrm.h>
17 17
18/* Informational hook. The decap is still done here. */
19static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
20static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
21
22int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
23{
24 struct xfrm_tunnel __rcu **pprev;
25 struct xfrm_tunnel *t;
26 int ret = -EEXIST;
27 int priority = handler->priority;
28
29 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
30
31 for (pprev = &rcv_notify_handlers;
32 (t = rcu_dereference_protected(*pprev,
33 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
34 pprev = &t->next) {
35 if (t->priority > priority)
36 break;
37 if (t->priority == priority)
38 goto err;
39
40 }
41
42 handler->next = *pprev;
43 rcu_assign_pointer(*pprev, handler);
44
45 ret = 0;
46
47err:
48 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
49 return ret;
50}
51EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
52
53int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
54{
55 struct xfrm_tunnel __rcu **pprev;
56 struct xfrm_tunnel *t;
57 int ret = -ENOENT;
58
59 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
60 for (pprev = &rcv_notify_handlers;
61 (t = rcu_dereference_protected(*pprev,
62 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
63 pprev = &t->next) {
64 if (t == handler) {
65 *pprev = handler->next;
66 ret = 0;
67 break;
68 }
69 }
70 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
71 synchronize_net();
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
76
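
xfrm4_mode_tunnel_input_register() above keeps rcv_notify_handlers sorted by priority and rejects duplicate priorities. A minimal userspace sketch of that insert-by-priority walk, with the RCU and mutex details stripped out:

#include <stdio.h>
#include <stddef.h>
#include <errno.h>

struct handler {
	struct handler *next;
	int priority;
};

static struct handler *handlers;

static int handler_register(struct handler *h)
{
	struct handler **pprev, *t;

	for (pprev = &handlers; (t = *pprev) != NULL; pprev = &t->next) {
		if (t->priority > h->priority)
			break;                 /* insert before higher priority */
		if (t->priority == h->priority)
			return -EEXIST;        /* duplicate priority rejected */
	}
	h->next = *pprev;
	*pprev = h;
	return 0;
}

int main(void)
{
	struct handler a = { .priority = 10 }, b = { .priority = 5 };

	handler_register(&a);
	handler_register(&b);
	for (struct handler *t = handlers; t; t = t->next)
		printf("priority %d\n", t->priority);   /* 5 then 10 */
	return 0;
}
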
18static inline void ipip_ecn_decapsulate(struct sk_buff *skb) 77static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
19{ 78{
20 struct iphdr *inner_iph = ipip_hdr(skb); 79 struct iphdr *inner_iph = ipip_hdr(skb);
@@ -64,8 +123,14 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
64 return 0; 123 return 0;
65} 124}
66 125
126#define for_each_input_rcu(head, handler) \
127 for (handler = rcu_dereference(head); \
128 handler != NULL; \
129 handler = rcu_dereference(handler->next))
130
67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 131static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
68{ 132{
133 struct xfrm_tunnel *handler;
69 int err = -EINVAL; 134 int err = -EINVAL;
70 135
71 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) 136 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -74,6 +139,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
74 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 139 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
75 goto out; 140 goto out;
76 141
142 for_each_input_rcu(rcv_notify_handlers, handler)
143 handler->handler(skb);
144
77 if (skb_cloned(skb) && 145 if (skb_cloned(skb) &&
78 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 146 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
79 goto out; 147 goto out;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 0d3426cb5c4f..c6281847f16a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,30 +79,19 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
79 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4; 80 const struct flowi4 *fl4 = &fl->u.ip4;
81 81
82 xdst->u.rt.rt_key_dst = fl4->daddr;
83 xdst->u.rt.rt_key_src = fl4->saddr;
84 xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
85 xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
86 xdst->u.rt.rt_iif = fl4->flowi4_iif; 82 xdst->u.rt.rt_iif = fl4->flowi4_iif;
87 xdst->u.rt.rt_oif = fl4->flowi4_oif;
88 xdst->u.rt.rt_mark = fl4->flowi4_mark;
89 83
90 xdst->u.dst.dev = dev; 84 xdst->u.dst.dev = dev;
91 dev_hold(dev); 85 dev_hold(dev);
92 86
93 xdst->u.rt.peer = rt->peer;
94 if (rt->peer)
95 atomic_inc(&rt->peer->refcnt);
96
97 /* Sheit... I remember I did this right. Apparently, 87 /* Sheit... I remember I did this right. Apparently,
98 * it was magically lost, so this code needs audit */ 88 * it was magically lost, so this code needs audit */
89 xdst->u.rt.rt_is_input = rt->rt_is_input;
99 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 90 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
100 RTCF_LOCAL); 91 RTCF_LOCAL);
101 xdst->u.rt.rt_type = rt->rt_type; 92 xdst->u.rt.rt_type = rt->rt_type;
102 xdst->u.rt.rt_src = rt->rt_src;
103 xdst->u.rt.rt_dst = rt->rt_dst;
104 xdst->u.rt.rt_gateway = rt->rt_gateway; 93 xdst->u.rt.rt_gateway = rt->rt_gateway;
105 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; 94 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
106 95
107 return 0; 96 return 0;
108} 97}
@@ -198,12 +187,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 187 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
199} 188}
200 189
201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 190static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
191 struct sk_buff *skb, u32 mtu)
192{
193 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
194 struct dst_entry *path = xdst->route;
195
196 path->ops->update_pmtu(path, sk, skb, mtu);
197}
198
199static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
200 struct sk_buff *skb)
202{ 201{
203 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 202 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
204 struct dst_entry *path = xdst->route; 203 struct dst_entry *path = xdst->route;
205 204
206 path->ops->update_pmtu(path, mtu); 205 path->ops->redirect(path, sk, skb);
207} 206}
208 207
209static void xfrm4_dst_destroy(struct dst_entry *dst) 208static void xfrm4_dst_destroy(struct dst_entry *dst)
@@ -212,9 +211,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
212 211
213 dst_destroy_metrics_generic(dst); 212 dst_destroy_metrics_generic(dst);
214 213
215 if (likely(xdst->u.rt.peer))
216 inet_putpeer(xdst->u.rt.peer);
217
218 xfrm_dst_destroy(xdst); 214 xfrm_dst_destroy(xdst);
219} 215}
220 216
@@ -232,6 +228,7 @@ static struct dst_ops xfrm4_dst_ops = {
232 .protocol = cpu_to_be16(ETH_P_IP), 228 .protocol = cpu_to_be16(ETH_P_IP),
233 .gc = xfrm4_garbage_collect, 229 .gc = xfrm4_garbage_collect,
234 .update_pmtu = xfrm4_update_pmtu, 230 .update_pmtu = xfrm4_update_pmtu,
231 .redirect = xfrm4_redirect,
235 .cow_metrics = dst_cow_metrics_generic, 232 .cow_metrics = dst_cow_metrics_generic,
236 .destroy = xfrm4_dst_destroy, 233 .destroy = xfrm4_dst_destroy,
237 .ifdown = xfrm4_dst_ifdown, 234 .ifdown = xfrm4_dst_ifdown,