author     Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ipv4
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 29
-rw-r--r--  net/ipv4/Makefile | 6
-rw-r--r--  net/ipv4/af_inet.c | 247
-rw-r--r--  net/ipv4/ah4.c | 36
-rw-r--r--  net/ipv4/arp.c | 90
-rw-r--r--  net/ipv4/cipso_ipv4.c | 19
-rw-r--r--  net/ipv4/devinet.c | 360
-rw-r--r--  net/ipv4/esp4.c | 51
-rw-r--r--  net/ipv4/fib_frontend.c | 188
-rw-r--r--  net/ipv4/fib_rules.c | 59
-rw-r--r--  net/ipv4/fib_semantics.c | 154
-rw-r--r--  net/ipv4/fib_trie.c | 95
-rw-r--r--  net/ipv4/gre.c | 10
-rw-r--r--  net/ipv4/icmp.c | 217
-rw-r--r--  net/ipv4/igmp.c | 81
-rw-r--r--  net/ipv4/inet_connection_sock.c | 212
-rw-r--r--  net/ipv4/inet_diag.c | 762
-rw-r--r--  net/ipv4/inet_fragment.c | 11
-rw-r--r--  net/ipv4/inet_hashtables.c | 38
-rw-r--r--  net/ipv4/inet_lro.c | 10
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 8
-rw-r--r--  net/ipv4/inetpeer.c | 152
-rw-r--r--  net/ipv4/ip_forward.c | 7
-rw-r--r--  net/ipv4/ip_fragment.c | 118
-rw-r--r--  net/ipv4/ip_gre.c | 327
-rw-r--r--  net/ipv4/ip_input.c | 60
-rw-r--r--  net/ipv4/ip_options.c | 70
-rw-r--r--  net/ipv4/ip_output.c | 212
-rw-r--r--  net/ipv4/ip_sockglue.c | 149
-rw-r--r--  net/ipv4/ip_vti.c | 942
-rw-r--r--  net/ipv4/ipcomp.c | 25
-rw-r--r--  net/ipv4/ipconfig.c | 197
-rw-r--r--  net/ipv4/ipip.c | 402
-rw-r--r--  net/ipv4/ipmr.c | 231
-rw-r--r--  net/ipv4/netfilter.c | 57
-rw-r--r--  net/ipv4/netfilter/Kconfig | 130
-rw-r--r--  net/ipv4/netfilter/Makefile | 24
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 15
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 13
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 13
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 31
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 141
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 19
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 329
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 199
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 1
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 143
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 167
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 281
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 53
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 36
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 38
-rw-r--r--  net/ipv4/ping.c | 84
-rw-r--r--  net/ipv4/proc.c | 30
-rw-r--r--  net/ipv4/protocol.c | 29
-rw-r--r--  net/ipv4/raw.c | 46
-rw-r--r--  net/ipv4/route.c | 2462
-rw-r--r--  net/ipv4/syncookies.c | 40
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 215
-rw-r--r--  net/ipv4/tcp.c | 891
-rw-r--r--  net/ipv4/tcp_bic.c | 11
-rw-r--r--  net/ipv4/tcp_cong.c | 30
-rw-r--r--  net/ipv4/tcp_cubic.c | 10
-rw-r--r--  net/ipv4/tcp_diag.c | 20
-rw-r--r--  net/ipv4/tcp_fastopen.c | 92
-rw-r--r--  net/ipv4/tcp_hybla.c | 10
-rw-r--r--  net/ipv4/tcp_illinois.c | 8
-rw-r--r--  net/ipv4/tcp_input.c | 1805
-rw-r--r--  net/ipv4/tcp_ipv4.c | 1044
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 291
-rw-r--r--  net/ipv4/tcp_metrics.c | 1091
-rw-r--r--  net/ipv4/tcp_minisocks.c | 183
-rw-r--r--  net/ipv4/tcp_output.c | 742
-rw-r--r--  net/ipv4/tcp_probe.c | 8
-rw-r--r--  net/ipv4/tcp_timer.c | 143
-rw-r--r--  net/ipv4/tunnel4.c | 18
-rw-r--r--  net/ipv4/udp.c | 185
-rw-r--r--  net/ipv4/udp_diag.c | 216
-rw-r--r--  net/ipv4/udp_impl.h | 2
-rw-r--r--  net/ipv4/udplite.c | 21
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c | 5
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 74
-rw-r--r--  net/ipv4/xfrm4_policy.c | 54
-rw-r--r--  net/ipv4/xfrm4_state.c | 1
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 22
92 files changed, 5920 insertions(+), 11321 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5a19aeb8609..cbb505ba932 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -163,6 +163,8 @@ config IP_PNP_RARP
163 operating on your network. Read 163 operating on your network. Read
164 <file:Documentation/filesystems/nfs/nfsroot.txt> for details. 164 <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
165 165
166# not yet ready..
167# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
166config NET_IPIP 168config NET_IPIP
167 tristate "IP: tunneling" 169 tristate "IP: tunneling"
168 select INET_TUNNEL 170 select INET_TUNNEL
@@ -262,8 +264,8 @@ config ARPD
262 bool "IP: ARP daemon support" 264 bool "IP: ARP daemon support"
263 ---help--- 265 ---help---
264 The kernel maintains an internal cache which maps IP addresses to 266 The kernel maintains an internal cache which maps IP addresses to
265 hardware addresses on the local network, so that Ethernet 267 hardware addresses on the local network, so that Ethernet/Token Ring/
266 frames are sent to the proper address on the physical networking 268 etc. frames are sent to the proper address on the physical networking
267 layer. Normally, kernel uses the ARP protocol to resolve these 269 layer. Normally, kernel uses the ARP protocol to resolve these
268 mappings. 270 mappings.
269 271
@@ -310,20 +312,9 @@ config SYN_COOKIES
310 312
311 If unsure, say N. 313 If unsure, say N.
312 314
313config NET_IPVTI
314 tristate "Virtual (secure) IP: tunneling"
315 select INET_TUNNEL
316 depends on INET_XFRM_MODE_TUNNEL
317 ---help---
318 Tunneling means encapsulating data of one protocol type within
319 another protocol and sending it over a channel that understands the
320 encapsulating protocol. This can be used with xfrm mode tunnel to give
321 the notion of a secure tunnel for IPSEC and then use routing protocol
322 on top.
323
324config INET_AH 315config INET_AH
325 tristate "IP: AH transformation" 316 tristate "IP: AH transformation"
326 select XFRM_ALGO 317 select XFRM
327 select CRYPTO 318 select CRYPTO
328 select CRYPTO_HMAC 319 select CRYPTO_HMAC
329 select CRYPTO_MD5 320 select CRYPTO_MD5
@@ -335,7 +326,7 @@ config INET_AH
335 326
336config INET_ESP 327config INET_ESP
337 tristate "IP: ESP transformation" 328 tristate "IP: ESP transformation"
338 select XFRM_ALGO 329 select XFRM
339 select CRYPTO 330 select CRYPTO
340 select CRYPTO_AUTHENC 331 select CRYPTO_AUTHENC
341 select CRYPTO_HMAC 332 select CRYPTO_HMAC
@@ -418,14 +409,6 @@ config INET_TCP_DIAG
418 depends on INET_DIAG 409 depends on INET_DIAG
419 def_tristate INET_DIAG 410 def_tristate INET_DIAG
420 411
421config INET_UDP_DIAG
422 tristate "UDP: socket monitoring interface"
423 depends on INET_DIAG && (IPV6 || IPV6=n)
424 default n
425 ---help---
426 Support for UDP socket monitoring interface used by the ss tool.
427 If unsure, say Y.
428
429menuconfig TCP_CONG_ADVANCED 412menuconfig TCP_CONG_ADVANCED
430 bool "TCP: advanced congestion control" 413 bool "TCP: advanced congestion control"
431 ---help--- 414 ---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63ec604..681084d76a9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,20 +7,20 @@ obj-y := route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o inet_hashtables.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \ 8 inet_timewait_sock.o inet_connection_sock.o \
9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ 10 tcp_minisocks.o tcp_cong.o \
11 datagram.o raw.o udp.o udplite.o \ 11 datagram.o raw.o udp.o udplite.o \
12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 12 arp.o icmp.o devinet.o af_inet.o igmp.o \
13 fib_frontend.o fib_semantics.o fib_trie.o \ 13 fib_frontend.o fib_semantics.o fib_trie.o \
14 inet_fragment.o ping.o 14 inet_fragment.o ping.o
15 15
16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o 16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
17obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
17obj-$(CONFIG_PROC_FS) += proc.o 18obj-$(CONFIG_PROC_FS) += proc.o
18obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 19obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
19obj-$(CONFIG_IP_MROUTE) += ipmr.o 20obj-$(CONFIG_IP_MROUTE) += ipmr.o
20obj-$(CONFIG_NET_IPIP) += ipip.o 21obj-$(CONFIG_NET_IPIP) += ipip.o
21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 22obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
22obj-$(CONFIG_NET_IPGRE) += ip_gre.o 23obj-$(CONFIG_NET_IPGRE) += ip_gre.o
23obj-$(CONFIG_NET_IPVTI) += ip_vti.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 24obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 25obj-$(CONFIG_INET_AH) += ah4.o
26obj-$(CONFIG_INET_ESP) += esp4.o 26obj-$(CONFIG_INET_ESP) += esp4.o
@@ -35,7 +35,6 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o
35obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ 35obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
36obj-$(CONFIG_INET_DIAG) += inet_diag.o 36obj-$(CONFIG_INET_DIAG) += inet_diag.o
37obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 37obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
38obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
39obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 38obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
40obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 39obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
41obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 40obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -49,7 +48,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 48obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 49obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 50obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
52obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 51obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
54 52
55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 53obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 24b384b7903..bf488051a8d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -65,8 +65,6 @@
65 * 2 of the License, or (at your option) any later version. 65 * 2 of the License, or (at your option) any later version.
66 */ 66 */
67 67
68#define pr_fmt(fmt) "IPv4: " fmt
69
70#include <linux/err.h> 68#include <linux/err.h>
71#include <linux/errno.h> 69#include <linux/errno.h>
72#include <linux/types.h> 70#include <linux/types.h>
@@ -91,6 +89,7 @@
91#include <linux/slab.h> 89#include <linux/slab.h>
92 90
93#include <asm/uaccess.h> 91#include <asm/uaccess.h>
92#include <asm/system.h>
94 93
95#include <linux/inet.h> 94#include <linux/inet.h>
96#include <linux/igmp.h> 95#include <linux/igmp.h>
@@ -119,6 +118,19 @@
119#include <linux/mroute.h> 118#include <linux/mroute.h>
120#endif 119#endif
121 120
121#ifdef CONFIG_ANDROID_PARANOID_NETWORK
122#include <linux/android_aid.h>
123
124static inline int current_has_network(void)
125{
126 return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
127}
128#else
129static inline int current_has_network(void)
130{
131 return 1;
132}
133#endif
122 134
123/* The inetsw table contains everything that inet_create needs to 135/* The inetsw table contains everything that inet_create needs to
124 * build a new socket. 136 * build a new socket.
@@ -157,7 +169,6 @@ void inet_sock_destruct(struct sock *sk)
157 169
158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 170 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 171 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160 dst_release(sk->sk_rx_dst);
161 sk_refcnt_debug_dec(sk); 172 sk_refcnt_debug_dec(sk);
162} 173}
163EXPORT_SYMBOL(inet_sock_destruct); 174EXPORT_SYMBOL(inet_sock_destruct);
@@ -212,26 +223,6 @@ int inet_listen(struct socket *sock, int backlog)
212 * we can only allow the backlog to be adjusted. 223 * we can only allow the backlog to be adjusted.
213 */ 224 */
214 if (old_state != TCP_LISTEN) { 225 if (old_state != TCP_LISTEN) {
215 /* Check special setups for testing purpose to enable TFO w/o
216 * requiring TCP_FASTOPEN sockopt.
217 * Note that only TCP sockets (SOCK_STREAM) will reach here.
218 * Also fastopenq may already been allocated because this
219 * socket was in TCP_LISTEN state previously but was
220 * shutdown() (rather than close()).
221 */
222 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
223 inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
224 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
225 err = fastopen_init_queue(sk, backlog);
226 else if ((sysctl_tcp_fastopen &
227 TFO_SERVER_WO_SOCKOPT2) != 0)
228 err = fastopen_init_queue(sk,
229 ((uint)sysctl_tcp_fastopen) >> 16);
230 else
231 err = 0;
232 if (err)
233 goto out;
234 }
235 err = inet_csk_listen_start(sk, backlog); 226 err = inet_csk_listen_start(sk, backlog);
236 if (err) 227 if (err)
237 goto out; 228 goto out;
@@ -263,21 +254,24 @@ void build_ehash_secret(void)
263} 254}
264EXPORT_SYMBOL(build_ehash_secret); 255EXPORT_SYMBOL(build_ehash_secret);
265 256
266static inline int inet_netns_ok(struct net *net, __u8 protocol) 257static inline int inet_netns_ok(struct net *net, int protocol)
267{ 258{
259 int hash;
268 const struct net_protocol *ipprot; 260 const struct net_protocol *ipprot;
269 261
270 if (net_eq(net, &init_net)) 262 if (net_eq(net, &init_net))
271 return 1; 263 return 1;
272 264
273 ipprot = rcu_dereference(inet_protos[protocol]); 265 hash = protocol & (MAX_INET_PROTOS - 1);
274 if (ipprot == NULL) { 266 ipprot = rcu_dereference(inet_protos[hash]);
267
268 if (ipprot == NULL)
275 /* raw IP is OK */ 269 /* raw IP is OK */
276 return 1; 270 return 1;
277 }
278 return ipprot->netns_ok; 271 return ipprot->netns_ok;
279} 272}
280 273
274
281/* 275/*
282 * Create an inet socket. 276 * Create an inet socket.
283 */ 277 */
@@ -294,6 +288,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
294 int try_loading_module = 0; 288 int try_loading_module = 0;
295 int err; 289 int err;
296 290
291 if (!current_has_network())
292 return -EACCES;
293
297 if (unlikely(!inet_ehash_secret)) 294 if (unlikely(!inet_ehash_secret))
298 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) 295 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
299 build_ehash_secret(); 296 build_ehash_secret();
@@ -346,8 +343,7 @@ lookup_protocol:
346 } 343 }
347 344
348 err = -EPERM; 345 err = -EPERM;
349 if (sock->type == SOCK_RAW && !kern && 346 if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
350 !ns_capable(net->user_ns, CAP_NET_RAW))
351 goto out_rcu_unlock; 347 goto out_rcu_unlock;
352 348
353 err = -EAFNOSUPPORT; 349 err = -EAFNOSUPPORT;
@@ -370,7 +366,7 @@ lookup_protocol:
370 err = 0; 366 err = 0;
371 sk->sk_no_check = answer_no_check; 367 sk->sk_no_check = answer_no_check;
372 if (INET_PROTOSW_REUSE & answer_flags) 368 if (INET_PROTOSW_REUSE & answer_flags)
373 sk->sk_reuse = SK_CAN_REUSE; 369 sk->sk_reuse = 1;
374 370
375 inet = inet_sk(sk); 371 inet = inet_sk(sk);
376 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; 372 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -402,7 +398,6 @@ lookup_protocol:
402 inet->mc_all = 1; 398 inet->mc_all = 1;
403 inet->mc_index = 0; 399 inet->mc_index = 0;
404 inet->mc_list = NULL; 400 inet->mc_list = NULL;
405 inet->rcv_tos = 0;
406 401
407 sk_refcnt_debug_inc(sk); 402 sk_refcnt_debug_inc(sk);
408 403
@@ -474,7 +469,6 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
474 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 469 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
475 struct sock *sk = sock->sk; 470 struct sock *sk = sock->sk;
476 struct inet_sock *inet = inet_sk(sk); 471 struct inet_sock *inet = inet_sk(sk);
477 struct net *net = sock_net(sk);
478 unsigned short snum; 472 unsigned short snum;
479 int chk_addr_ret; 473 int chk_addr_ret;
480 int err; 474 int err;
@@ -498,7 +492,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
498 goto out; 492 goto out;
499 } 493 }
500 494
501 chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); 495 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
502 496
503 /* Not specified by any standard per-se, however it breaks too 497 /* Not specified by any standard per-se, however it breaks too
504 * many applications when removed. It is unfortunate since 498 * many applications when removed. It is unfortunate since
@@ -518,8 +512,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
518 512
519 snum = ntohs(addr->sin_port); 513 snum = ntohs(addr->sin_port);
520 err = -EACCES; 514 err = -EACCES;
521 if (snum && snum < PROT_SOCK && 515 if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
522 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
523 goto out; 516 goto out;
524 517
525 /* We keep a pair of addresses. rcv_saddr is the one 518 /* We keep a pair of addresses. rcv_saddr is the one
@@ -563,7 +556,7 @@ out:
563} 556}
564EXPORT_SYMBOL(inet_bind); 557EXPORT_SYMBOL(inet_bind);
565 558
566int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, 559int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
567 int addr_len, int flags) 560 int addr_len, int flags)
568{ 561{
569 struct sock *sk = sock->sk; 562 struct sock *sk = sock->sk;
@@ -575,16 +568,15 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
575 568
576 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 569 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
577 return -EAGAIN; 570 return -EAGAIN;
578 return sk->sk_prot->connect(sk, uaddr, addr_len); 571 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
579} 572}
580EXPORT_SYMBOL(inet_dgram_connect); 573EXPORT_SYMBOL(inet_dgram_connect);
581 574
582static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) 575static long inet_wait_for_connect(struct sock *sk, long timeo)
583{ 576{
584 DEFINE_WAIT(wait); 577 DEFINE_WAIT(wait);
585 578
586 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 579 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
587 sk->sk_write_pending += writebias;
588 580
589 /* Basic assumption: if someone sets sk->sk_err, he _must_ 581 /* Basic assumption: if someone sets sk->sk_err, he _must_
590 * change state of the socket from TCP_SYN_*. 582 * change state of the socket from TCP_SYN_*.
@@ -600,7 +592,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
600 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 592 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
601 } 593 }
602 finish_wait(sk_sleep(sk), &wait); 594 finish_wait(sk_sleep(sk), &wait);
603 sk->sk_write_pending -= writebias;
604 return timeo; 595 return timeo;
605} 596}
606 597
@@ -608,8 +599,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
608 * Connect to a remote host. There is regrettably still a little 599 * Connect to a remote host. There is regrettably still a little
609 * TCP 'magic' in here. 600 * TCP 'magic' in here.
610 */ 601 */
611int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 602int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
612 int addr_len, int flags) 603 int addr_len, int flags)
613{ 604{
614 struct sock *sk = sock->sk; 605 struct sock *sk = sock->sk;
615 int err; 606 int err;
@@ -618,6 +609,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
618 if (addr_len < sizeof(uaddr->sa_family)) 609 if (addr_len < sizeof(uaddr->sa_family))
619 return -EINVAL; 610 return -EINVAL;
620 611
612 lock_sock(sk);
613
621 if (uaddr->sa_family == AF_UNSPEC) { 614 if (uaddr->sa_family == AF_UNSPEC) {
622 err = sk->sk_prot->disconnect(sk, flags); 615 err = sk->sk_prot->disconnect(sk, flags);
623 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 616 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -657,12 +650,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
657 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 650 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
658 651
659 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 652 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
660 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
661 tcp_sk(sk)->fastopen_req &&
662 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
663
664 /* Error code is set above */ 653 /* Error code is set above */
665 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) 654 if (!timeo || !inet_wait_for_connect(sk, timeo))
666 goto out; 655 goto out;
667 656
668 err = sock_intr_errno(timeo); 657 err = sock_intr_errno(timeo);
@@ -684,6 +673,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
684 sock->state = SS_CONNECTED; 673 sock->state = SS_CONNECTED;
685 err = 0; 674 err = 0;
686out: 675out:
676 release_sock(sk);
687 return err; 677 return err;
688 678
689sock_error: 679sock_error:
@@ -693,18 +683,6 @@ sock_error:
693 sock->state = SS_DISCONNECTING; 683 sock->state = SS_DISCONNECTING;
694 goto out; 684 goto out;
695} 685}
696EXPORT_SYMBOL(__inet_stream_connect);
697
698int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
699 int addr_len, int flags)
700{
701 int err;
702
703 lock_sock(sock->sk);
704 err = __inet_stream_connect(sock, uaddr, addr_len, flags);
705 release_sock(sock->sk);
706 return err;
707}
708EXPORT_SYMBOL(inet_stream_connect); 686EXPORT_SYMBOL(inet_stream_connect);
709 687
710/* 688/*
@@ -724,8 +702,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
724 702
725 sock_rps_record_flow(sk2); 703 sock_rps_record_flow(sk2);
726 WARN_ON(!((1 << sk2->sk_state) & 704 WARN_ON(!((1 << sk2->sk_state) &
727 (TCPF_ESTABLISHED | TCPF_SYN_RECV | 705 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
728 TCPF_CLOSE_WAIT | TCPF_CLOSE)));
729 706
730 sock_graft(sk2, newsock); 707 sock_graft(sk2, newsock);
731 708
@@ -919,6 +896,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
919 case SIOCSIFPFLAGS: 896 case SIOCSIFPFLAGS:
920 case SIOCGIFPFLAGS: 897 case SIOCGIFPFLAGS:
921 case SIOCSIFFLAGS: 898 case SIOCSIFFLAGS:
899 case SIOCKILLADDR:
922 err = devinet_ioctl(net, cmd, (void __user *)arg); 900 err = devinet_ioctl(net, cmd, (void __user *)arg);
923 break; 901 break;
924 default: 902 default:
@@ -933,7 +911,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
933EXPORT_SYMBOL(inet_ioctl); 911EXPORT_SYMBOL(inet_ioctl);
934 912
935#ifdef CONFIG_COMPAT 913#ifdef CONFIG_COMPAT
936static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 914int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
937{ 915{
938 struct sock *sk = sock->sk; 916 struct sock *sk = sock->sk;
939 int err = -ENOIOCTLCMD; 917 int err = -ENOIOCTLCMD;
@@ -1124,11 +1102,13 @@ out:
1124 return; 1102 return;
1125 1103
1126out_permanent: 1104out_permanent:
1127 pr_err("Attempt to override permanent protocol %d\n", protocol); 1105 printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
1106 protocol);
1128 goto out; 1107 goto out;
1129 1108
1130out_illegal: 1109out_illegal:
1131 pr_err("Ignoring attempt to register invalid socket type %d\n", 1110 printk(KERN_ERR
1111 "Ignoring attempt to register invalid socket type %d.\n",
1132 p->type); 1112 p->type);
1133 goto out; 1113 goto out;
1134} 1114}
@@ -1137,7 +1117,8 @@ EXPORT_SYMBOL(inet_register_protosw);
1137void inet_unregister_protosw(struct inet_protosw *p) 1117void inet_unregister_protosw(struct inet_protosw *p)
1138{ 1118{
1139 if (INET_PROTOSW_PERMANENT & p->flags) { 1119 if (INET_PROTOSW_PERMANENT & p->flags) {
1140 pr_err("Attempt to unregister permanent protocol %d\n", 1120 printk(KERN_ERR
1121 "Attempt to unregister permanent protocol %d.\n",
1141 p->protocol); 1122 p->protocol);
1142 } else { 1123 } else {
1143 spin_lock_bh(&inetsw_lock); 1124 spin_lock_bh(&inetsw_lock);
@@ -1186,8 +1167,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1186 return 0; 1167 return 0;
1187 1168
1188 if (sysctl_ip_dynaddr > 1) { 1169 if (sysctl_ip_dynaddr > 1) {
1189 pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", 1170 printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
1190 __func__, &old_saddr, &new_saddr); 1171 __func__, &old_saddr, &new_saddr);
1191 } 1172 }
1192 1173
1193 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; 1174 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
@@ -1254,8 +1235,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
1254 1235
1255static int inet_gso_send_check(struct sk_buff *skb) 1236static int inet_gso_send_check(struct sk_buff *skb)
1256{ 1237{
1257 const struct net_offload *ops;
1258 const struct iphdr *iph; 1238 const struct iphdr *iph;
1239 const struct net_protocol *ops;
1259 int proto; 1240 int proto;
1260 int ihl; 1241 int ihl;
1261 int err = -EINVAL; 1242 int err = -EINVAL;
@@ -1274,25 +1255,24 @@ static int inet_gso_send_check(struct sk_buff *skb)
1274 __skb_pull(skb, ihl); 1255 __skb_pull(skb, ihl);
1275 skb_reset_transport_header(skb); 1256 skb_reset_transport_header(skb);
1276 iph = ip_hdr(skb); 1257 iph = ip_hdr(skb);
1277 proto = iph->protocol; 1258 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1278 err = -EPROTONOSUPPORT; 1259 err = -EPROTONOSUPPORT;
1279 1260
1280 rcu_read_lock(); 1261 rcu_read_lock();
1281 ops = rcu_dereference(inet_offloads[proto]); 1262 ops = rcu_dereference(inet_protos[proto]);
1282 if (likely(ops && ops->callbacks.gso_send_check)) 1263 if (likely(ops && ops->gso_send_check))
1283 err = ops->callbacks.gso_send_check(skb); 1264 err = ops->gso_send_check(skb);
1284 rcu_read_unlock(); 1265 rcu_read_unlock();
1285 1266
1286out: 1267out:
1287 return err; 1268 return err;
1288} 1269}
1289 1270
1290static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1271static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
1291 netdev_features_t features)
1292{ 1272{
1293 struct sk_buff *segs = ERR_PTR(-EINVAL); 1273 struct sk_buff *segs = ERR_PTR(-EINVAL);
1294 const struct net_offload *ops;
1295 struct iphdr *iph; 1274 struct iphdr *iph;
1275 const struct net_protocol *ops;
1296 int proto; 1276 int proto;
1297 int ihl; 1277 int ihl;
1298 int id; 1278 int id;
@@ -1324,13 +1304,13 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1324 skb_reset_transport_header(skb); 1304 skb_reset_transport_header(skb);
1325 iph = ip_hdr(skb); 1305 iph = ip_hdr(skb);
1326 id = ntohs(iph->id); 1306 id = ntohs(iph->id);
1327 proto = iph->protocol; 1307 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1328 segs = ERR_PTR(-EPROTONOSUPPORT); 1308 segs = ERR_PTR(-EPROTONOSUPPORT);
1329 1309
1330 rcu_read_lock(); 1310 rcu_read_lock();
1331 ops = rcu_dereference(inet_offloads[proto]); 1311 ops = rcu_dereference(inet_protos[proto]);
1332 if (likely(ops && ops->callbacks.gso_segment)) 1312 if (likely(ops && ops->gso_segment))
1333 segs = ops->callbacks.gso_segment(skb, features); 1313 segs = ops->gso_segment(skb, features);
1334 rcu_read_unlock(); 1314 rcu_read_unlock();
1335 1315
1336 if (!segs || IS_ERR(segs)) 1316 if (!segs || IS_ERR(segs))
@@ -1359,7 +1339,7 @@ out:
1359static struct sk_buff **inet_gro_receive(struct sk_buff **head, 1339static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1360 struct sk_buff *skb) 1340 struct sk_buff *skb)
1361{ 1341{
1362 const struct net_offload *ops; 1342 const struct net_protocol *ops;
1363 struct sk_buff **pp = NULL; 1343 struct sk_buff **pp = NULL;
1364 struct sk_buff *p; 1344 struct sk_buff *p;
1365 const struct iphdr *iph; 1345 const struct iphdr *iph;
@@ -1378,17 +1358,17 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1378 goto out; 1358 goto out;
1379 } 1359 }
1380 1360
1381 proto = iph->protocol; 1361 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1382 1362
1383 rcu_read_lock(); 1363 rcu_read_lock();
1384 ops = rcu_dereference(inet_offloads[proto]); 1364 ops = rcu_dereference(inet_protos[proto]);
1385 if (!ops || !ops->callbacks.gro_receive) 1365 if (!ops || !ops->gro_receive)
1386 goto out_unlock; 1366 goto out_unlock;
1387 1367
1388 if (*(u8 *)iph != 0x45) 1368 if (*(u8 *)iph != 0x45)
1389 goto out_unlock; 1369 goto out_unlock;
1390 1370
1391 if (unlikely(ip_fast_csum((u8 *)iph, 5))) 1371 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
1392 goto out_unlock; 1372 goto out_unlock;
1393 1373
1394 id = ntohl(*(__be32 *)&iph->id); 1374 id = ntohl(*(__be32 *)&iph->id);
@@ -1404,6 +1384,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1404 iph2 = ip_hdr(p); 1384 iph2 = ip_hdr(p);
1405 1385
1406 if ((iph->protocol ^ iph2->protocol) | 1386 if ((iph->protocol ^ iph2->protocol) |
1387 (iph->tos ^ iph2->tos) |
1407 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | 1388 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
1408 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { 1389 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
1409 NAPI_GRO_CB(p)->same_flow = 0; 1390 NAPI_GRO_CB(p)->same_flow = 0;
@@ -1413,7 +1394,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1413 /* All fields must match except length and checksum. */ 1394 /* All fields must match except length and checksum. */
1414 NAPI_GRO_CB(p)->flush |= 1395 NAPI_GRO_CB(p)->flush |=
1415 (iph->ttl ^ iph2->ttl) | 1396 (iph->ttl ^ iph2->ttl) |
1416 (iph->tos ^ iph2->tos) |
1417 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); 1397 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1418 1398
1419 NAPI_GRO_CB(p)->flush |= flush; 1399 NAPI_GRO_CB(p)->flush |= flush;
@@ -1423,7 +1403,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1423 skb_gro_pull(skb, sizeof(*iph)); 1403 skb_gro_pull(skb, sizeof(*iph));
1424 skb_set_transport_header(skb, skb_gro_offset(skb)); 1404 skb_set_transport_header(skb, skb_gro_offset(skb));
1425 1405
1426 pp = ops->callbacks.gro_receive(head, skb); 1406 pp = ops->gro_receive(head, skb);
1427 1407
1428out_unlock: 1408out_unlock:
1429 rcu_read_unlock(); 1409 rcu_read_unlock();
@@ -1436,21 +1416,21 @@ out:
1436 1416
1437static int inet_gro_complete(struct sk_buff *skb) 1417static int inet_gro_complete(struct sk_buff *skb)
1438{ 1418{
1439 __be16 newlen = htons(skb->len - skb_network_offset(skb)); 1419 const struct net_protocol *ops;
1440 struct iphdr *iph = ip_hdr(skb); 1420 struct iphdr *iph = ip_hdr(skb);
1441 const struct net_offload *ops; 1421 int proto = iph->protocol & (MAX_INET_PROTOS - 1);
1442 int proto = iph->protocol;
1443 int err = -ENOSYS; 1422 int err = -ENOSYS;
1423 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1444 1424
1445 csum_replace2(&iph->check, iph->tot_len, newlen); 1425 csum_replace2(&iph->check, iph->tot_len, newlen);
1446 iph->tot_len = newlen; 1426 iph->tot_len = newlen;
1447 1427
1448 rcu_read_lock(); 1428 rcu_read_lock();
1449 ops = rcu_dereference(inet_offloads[proto]); 1429 ops = rcu_dereference(inet_protos[proto]);
1450 if (WARN_ON(!ops || !ops->callbacks.gro_complete)) 1430 if (WARN_ON(!ops || !ops->gro_complete))
1451 goto out_unlock; 1431 goto out_unlock;
1452 1432
1453 err = ops->callbacks.gro_complete(skb); 1433 err = ops->gro_complete(skb);
1454 1434
1455out_unlock: 1435out_unlock:
1456 rcu_read_unlock(); 1436 rcu_read_unlock();
@@ -1558,36 +1538,25 @@ static const struct net_protocol igmp_protocol = {
1558#endif 1538#endif
1559 1539
1560static const struct net_protocol tcp_protocol = { 1540static const struct net_protocol tcp_protocol = {
1561 .early_demux = tcp_v4_early_demux, 1541 .handler = tcp_v4_rcv,
1562 .handler = tcp_v4_rcv, 1542 .err_handler = tcp_v4_err,
1563 .err_handler = tcp_v4_err, 1543 .gso_send_check = tcp_v4_gso_send_check,
1564 .no_policy = 1, 1544 .gso_segment = tcp_tso_segment,
1565 .netns_ok = 1, 1545 .gro_receive = tcp4_gro_receive,
1566}; 1546 .gro_complete = tcp4_gro_complete,
1567 1547 .no_policy = 1,
1568static const struct net_offload tcp_offload = { 1548 .netns_ok = 1,
1569 .callbacks = {
1570 .gso_send_check = tcp_v4_gso_send_check,
1571 .gso_segment = tcp_tso_segment,
1572 .gro_receive = tcp4_gro_receive,
1573 .gro_complete = tcp4_gro_complete,
1574 },
1575}; 1549};
1576 1550
1577static const struct net_protocol udp_protocol = { 1551static const struct net_protocol udp_protocol = {
1578 .handler = udp_rcv, 1552 .handler = udp_rcv,
1579 .err_handler = udp_err, 1553 .err_handler = udp_err,
1554 .gso_send_check = udp4_ufo_send_check,
1555 .gso_segment = udp4_ufo_fragment,
1580 .no_policy = 1, 1556 .no_policy = 1,
1581 .netns_ok = 1, 1557 .netns_ok = 1,
1582}; 1558};
1583 1559
1584static const struct net_offload udp_offload = {
1585 .callbacks = {
1586 .gso_send_check = udp4_ufo_send_check,
1587 .gso_segment = udp4_ufo_fragment,
1588 },
1589};
1590
1591static const struct net_protocol icmp_protocol = { 1560static const struct net_protocol icmp_protocol = {
1592 .handler = icmp_rcv, 1561 .handler = icmp_rcv,
1593 .err_handler = ping_err, 1562 .err_handler = ping_err,
@@ -1621,9 +1590,9 @@ static __net_init int ipv4_mib_init_net(struct net *net)
1621 sizeof(struct icmp_mib), 1590 sizeof(struct icmp_mib),
1622 __alignof__(struct icmp_mib)) < 0) 1591 __alignof__(struct icmp_mib)) < 0)
1623 goto err_icmp_mib; 1592 goto err_icmp_mib;
1624 net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), 1593 if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
1625 GFP_KERNEL); 1594 sizeof(struct icmpmsg_mib),
1626 if (!net->mib.icmpmsg_statistics) 1595 __alignof__(struct icmpmsg_mib)) < 0)
1627 goto err_icmpmsg_mib; 1596 goto err_icmpmsg_mib;
1628 1597
1629 tcp_mib_init(net); 1598 tcp_mib_init(net);
@@ -1647,7 +1616,7 @@ err_tcp_mib:
1647 1616
1648static __net_exit void ipv4_mib_exit_net(struct net *net) 1617static __net_exit void ipv4_mib_exit_net(struct net *net)
1649{ 1618{
1650 kfree(net->mib.icmpmsg_statistics); 1619 snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
1651 snmp_mib_free((void __percpu **)net->mib.icmp_statistics); 1620 snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
1652 snmp_mib_free((void __percpu **)net->mib.udplite_statistics); 1621 snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
1653 snmp_mib_free((void __percpu **)net->mib.udp_statistics); 1622 snmp_mib_free((void __percpu **)net->mib.udp_statistics);
@@ -1672,35 +1641,13 @@ static int ipv4_proc_init(void);
1672 * IP protocol layer initialiser 1641 * IP protocol layer initialiser
1673 */ 1642 */
1674 1643
1675static struct packet_offload ip_packet_offload __read_mostly = {
1676 .type = cpu_to_be16(ETH_P_IP),
1677 .callbacks = {
1678 .gso_send_check = inet_gso_send_check,
1679 .gso_segment = inet_gso_segment,
1680 .gro_receive = inet_gro_receive,
1681 .gro_complete = inet_gro_complete,
1682 },
1683};
1684
1685static int __init ipv4_offload_init(void)
1686{
1687 /*
1688 * Add offloads
1689 */
1690 if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0)
1691 pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
1692 if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0)
1693 pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__);
1694
1695 dev_add_offload(&ip_packet_offload);
1696 return 0;
1697}
1698
1699fs_initcall(ipv4_offload_init);
1700
1701static struct packet_type ip_packet_type __read_mostly = { 1644static struct packet_type ip_packet_type __read_mostly = {
1702 .type = cpu_to_be16(ETH_P_IP), 1645 .type = cpu_to_be16(ETH_P_IP),
1703 .func = ip_rcv, 1646 .func = ip_rcv,
1647 .gso_send_check = inet_gso_send_check,
1648 .gso_segment = inet_gso_segment,
1649 .gro_receive = inet_gro_receive,
1650 .gro_complete = inet_gro_complete,
1704}; 1651};
1705 1652
1706static int __init inet_init(void) 1653static int __init inet_init(void)
@@ -1742,21 +1689,19 @@ static int __init inet_init(void)
1742 ip_static_sysctl_init(); 1689 ip_static_sysctl_init();
1743#endif 1690#endif
1744 1691
1745 tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
1746
1747 /* 1692 /*
1748 * Add all the base protocols. 1693 * Add all the base protocols.
1749 */ 1694 */
1750 1695
1751 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) 1696 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
1752 pr_crit("%s: Cannot add ICMP protocol\n", __func__); 1697 printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
1753 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) 1698 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
1754 pr_crit("%s: Cannot add UDP protocol\n", __func__); 1699 printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
1755 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) 1700 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
1756 pr_crit("%s: Cannot add TCP protocol\n", __func__); 1701 printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
1757#ifdef CONFIG_IP_MULTICAST 1702#ifdef CONFIG_IP_MULTICAST
1758 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) 1703 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
1759 pr_crit("%s: Cannot add IGMP protocol\n", __func__); 1704 printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
1760#endif 1705#endif
1761 1706
1762 /* Register the socket-side information for inet_create. */ 1707 /* Register the socket-side information for inet_create. */
@@ -1803,14 +1748,14 @@ static int __init inet_init(void)
1803 */ 1748 */
1804#if defined(CONFIG_IP_MROUTE) 1749#if defined(CONFIG_IP_MROUTE)
1805 if (ip_mr_init()) 1750 if (ip_mr_init())
1806 pr_crit("%s: Cannot init ipv4 mroute\n", __func__); 1751 printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
1807#endif 1752#endif
1808 /* 1753 /*
1809 * Initialise per-cpu ipv4 mibs 1754 * Initialise per-cpu ipv4 mibs
1810 */ 1755 */
1811 1756
1812 if (init_ipv4_mibs()) 1757 if (init_ipv4_mibs())
1813 pr_crit("%s: Cannot init ipv4 mibs\n", __func__); 1758 printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
1814 1759
1815 ipv4_proc_init(); 1760 ipv4_proc_init();
1816 1761
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a0d8392491c..36d14406261 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) "IPsec: " fmt
2
3#include <crypto/hash.h> 1#include <crypto/hash.h>
4#include <linux/err.h> 2#include <linux/err.h>
5#include <linux/module.h> 3#include <linux/module.h>
@@ -77,7 +75,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
77 75
78static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) 76static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
79{ 77{
80 unsigned char *optptr = (unsigned char *)(iph+1); 78 unsigned char * optptr = (unsigned char*)(iph+1);
81 int l = iph->ihl*4 - sizeof(struct iphdr); 79 int l = iph->ihl*4 - sizeof(struct iphdr);
82 int optlen; 80 int optlen;
83 81
@@ -398,25 +396,16 @@ static void ah4_err(struct sk_buff *skb, u32 info)
398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); 396 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
399 struct xfrm_state *x; 397 struct xfrm_state *x;
400 398
401 switch (icmp_hdr(skb)->type) { 399 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
402 case ICMP_DEST_UNREACH: 400 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
403 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
404 return;
405 case ICMP_REDIRECT:
406 break;
407 default:
408 return; 401 return;
409 }
410 402
411 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 403 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
412 ah->spi, IPPROTO_AH, AF_INET); 404 ah->spi, IPPROTO_AH, AF_INET);
413 if (!x) 405 if (!x)
414 return; 406 return;
415 407 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
416 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 408 ntohl(ah->spi), ntohl(iph->daddr));
417 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
418 else
419 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
420 xfrm_state_put(x); 409 xfrm_state_put(x);
421} 410}
422 411
@@ -456,10 +445,9 @@ static int ah_init_state(struct xfrm_state *x)
456 445
457 if (aalg_desc->uinfo.auth.icv_fullbits/8 != 446 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
458 crypto_ahash_digestsize(ahash)) { 447 crypto_ahash_digestsize(ahash)) {
459 pr_info("%s: %s digestsize %u != %hu\n", 448 printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
460 __func__, x->aalg->alg_name, 449 x->aalg->alg_name, crypto_ahash_digestsize(ahash),
461 crypto_ahash_digestsize(ahash), 450 aalg_desc->uinfo.auth.icv_fullbits/8);
462 aalg_desc->uinfo.auth.icv_fullbits / 8);
463 goto error; 451 goto error;
464 } 452 }
465 453
@@ -522,11 +510,11 @@ static const struct net_protocol ah4_protocol = {
522static int __init ah4_init(void) 510static int __init ah4_init(void)
523{ 511{
524 if (xfrm_register_type(&ah_type, AF_INET) < 0) { 512 if (xfrm_register_type(&ah_type, AF_INET) < 0) {
525 pr_info("%s: can't add xfrm type\n", __func__); 513 printk(KERN_INFO "ip ah init: can't add xfrm type\n");
526 return -EAGAIN; 514 return -EAGAIN;
527 } 515 }
528 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { 516 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
529 pr_info("%s: can't add protocol\n", __func__); 517 printk(KERN_INFO "ip ah init: can't add protocol\n");
530 xfrm_unregister_type(&ah_type, AF_INET); 518 xfrm_unregister_type(&ah_type, AF_INET);
531 return -EAGAIN; 519 return -EAGAIN;
532 } 520 }
@@ -536,9 +524,9 @@ static int __init ah4_init(void)
536static void __exit ah4_fini(void) 524static void __exit ah4_fini(void)
537{ 525{
538 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) 526 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
539 pr_info("%s: can't remove protocol\n", __func__); 527 printk(KERN_INFO "ip ah close: can't remove protocol\n");
540 if (xfrm_unregister_type(&ah_type, AF_INET) < 0) 528 if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
541 pr_info("%s: can't remove xfrm type\n", __func__); 529 printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
542} 530}
543 531
544module_init(ah4_init); 532module_init(ah4_init);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 9547a273b9e..96a164aa136 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,8 +73,6 @@
73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
74 */ 74 */
75 75
76#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
77
78#include <linux/module.h> 76#include <linux/module.h>
79#include <linux/types.h> 77#include <linux/types.h>
80#include <linux/string.h> 78#include <linux/string.h>
@@ -91,6 +89,7 @@
91#include <linux/etherdevice.h> 89#include <linux/etherdevice.h>
92#include <linux/fddidevice.h> 90#include <linux/fddidevice.h>
93#include <linux/if_arp.h> 91#include <linux/if_arp.h>
92#include <linux/trdevice.h>
94#include <linux/skbuff.h> 93#include <linux/skbuff.h>
95#include <linux/proc_fs.h> 94#include <linux/proc_fs.h>
96#include <linux/seq_file.h> 95#include <linux/seq_file.h>
@@ -113,7 +112,13 @@
113#include <net/arp.h> 112#include <net/arp.h>
114#include <net/ax25.h> 113#include <net/ax25.h>
115#include <net/netrom.h> 114#include <net/netrom.h>
115#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
116#include <net/atmclip.h>
117struct neigh_table *clip_tbl_hook;
118EXPORT_SYMBOL(clip_tbl_hook);
119#endif
116 120
121#include <asm/system.h>
117#include <linux/uaccess.h> 122#include <linux/uaccess.h>
118 123
119#include <linux/netfilter_arp.h> 124#include <linux/netfilter_arp.h>
@@ -121,7 +126,7 @@
121/* 126/*
122 * Interface to generic neighbour cache. 127 * Interface to generic neighbour cache.
123 */ 128 */
124static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); 129static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
125static int arp_constructor(struct neighbour *neigh); 130static int arp_constructor(struct neighbour *neigh);
126static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); 131static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
127static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); 132static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -159,6 +164,7 @@ static const struct neigh_ops arp_broken_ops = {
159 164
160struct neigh_table arp_tbl = { 165struct neigh_table arp_tbl = {
161 .family = AF_INET, 166 .family = AF_INET,
167 .entry_size = sizeof(struct neighbour) + 4,
162 .key_len = 4, 168 .key_len = 4,
163 .hash = arp_hash, 169 .hash = arp_hash,
164 .constructor = arp_constructor, 170 .constructor = arp_constructor,
@@ -171,7 +177,7 @@ struct neigh_table arp_tbl = {
171 .gc_staletime = 60 * HZ, 177 .gc_staletime = 60 * HZ,
172 .reachable_time = 30 * HZ, 178 .reachable_time = 30 * HZ,
173 .delay_probe_time = 5 * HZ, 179 .delay_probe_time = 5 * HZ,
174 .queue_len_bytes = 64*1024, 180 .queue_len = 3,
175 .ucast_probes = 3, 181 .ucast_probes = 3,
176 .mcast_probes = 3, 182 .mcast_probes = 3,
177 .anycast_delay = 1 * HZ, 183 .anycast_delay = 1 * HZ,
@@ -194,6 +200,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
194 case ARPHRD_IEEE802: 200 case ARPHRD_IEEE802:
195 ip_eth_mc_map(addr, haddr); 201 ip_eth_mc_map(addr, haddr);
196 return 0; 202 return 0;
203 case ARPHRD_IEEE802_TR:
204 ip_tr_mc_map(addr, haddr);
205 return 0;
197 case ARPHRD_INFINIBAND: 206 case ARPHRD_INFINIBAND:
198 ip_ib_mc_map(addr, dev->broadcast, haddr); 207 ip_ib_mc_map(addr, dev->broadcast, haddr);
199 return 0; 208 return 0;
@@ -212,9 +221,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
212 221
213static u32 arp_hash(const void *pkey, 222static u32 arp_hash(const void *pkey,
214 const struct net_device *dev, 223 const struct net_device *dev,
215 __u32 *hash_rnd) 224 __u32 hash_rnd)
216{ 225{
217 return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); 226 return arp_hashfn(*(u32 *)pkey, dev, hash_rnd);
218} 227}
219 228
220static int arp_constructor(struct neighbour *neigh) 229static int arp_constructor(struct neighbour *neigh)
@@ -274,9 +283,9 @@ static int arp_constructor(struct neighbour *neigh)
274 default: 283 default:
275 break; 284 break;
276 case ARPHRD_ROSE: 285 case ARPHRD_ROSE:
277#if IS_ENABLED(CONFIG_AX25) 286#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
278 case ARPHRD_AX25: 287 case ARPHRD_AX25:
279#if IS_ENABLED(CONFIG_NETROM) 288#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
280 case ARPHRD_NETROM: 289 case ARPHRD_NETROM:
281#endif 290#endif
282 neigh->ops = &arp_broken_ops; 291 neigh->ops = &arp_broken_ops;
@@ -321,7 +330,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
321static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) 330static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
322{ 331{
323 __be32 saddr = 0; 332 __be32 saddr = 0;
324 u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; 333 u8 *dst_ha = NULL;
325 struct net_device *dev = neigh->dev; 334 struct net_device *dev = neigh->dev;
326 __be32 target = *(__be32 *)neigh->primary_key; 335 __be32 target = *(__be32 *)neigh->primary_key;
327 int probes = atomic_read(&neigh->probes); 336 int probes = atomic_read(&neigh->probes);
@@ -362,9 +371,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
362 probes -= neigh->parms->ucast_probes; 371 probes -= neigh->parms->ucast_probes;
363 if (probes < 0) { 372 if (probes < 0) {
364 if (!(neigh->nud_state & NUD_VALID)) 373 if (!(neigh->nud_state & NUD_VALID))
365 pr_debug("trying to ucast probe in NUD_INVALID\n"); 374 printk(KERN_DEBUG
366 neigh_ha_snapshot(dst_ha, neigh, dev); 375 "trying to ucast probe in NUD_INVALID\n");
367 dst_hw = dst_ha; 376 dst_ha = neigh->ha;
377 read_lock_bh(&neigh->lock);
368 } else { 378 } else {
369 probes -= neigh->parms->app_probes; 379 probes -= neigh->parms->app_probes;
370 if (probes < 0) { 380 if (probes < 0) {
@@ -376,7 +386,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
376 } 386 }
377 387
378 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 388 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
379 dst_hw, dev->dev_addr, NULL); 389 dst_ha, dev->dev_addr, NULL);
390 if (dst_ha)
391 read_unlock_bh(&neigh->lock);
380} 392}
381 393
382static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) 394static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -447,7 +459,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
447{ 459{
448 switch (addr_hint) { 460 switch (addr_hint) {
449 case RTN_LOCAL: 461 case RTN_LOCAL:
450 pr_debug("arp called for own IP address\n"); 462 printk(KERN_DEBUG "ARP: arp called for own IP address\n");
451 memcpy(haddr, dev->dev_addr, dev->addr_len); 463 memcpy(haddr, dev->dev_addr, dev->addr_len);
452 return 1; 464 return 1;
453 case RTN_MULTICAST: 465 case RTN_MULTICAST:
@@ -468,12 +480,13 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
468 struct neighbour *n; 480 struct neighbour *n;
469 481
470 if (!skb_dst(skb)) { 482 if (!skb_dst(skb)) {
471 pr_debug("arp_find is called with dst==NULL\n"); 483 printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
472 kfree_skb(skb); 484 kfree_skb(skb);
473 return 1; 485 return 1;
474 } 486 }
475 487
476 paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); 488 paddr = skb_rtable(skb)->rt_gateway;
489
477 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, 490 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
478 paddr, dev)) 491 paddr, dev))
479 return 0; 492 return 0;
@@ -579,18 +592,16 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
579 struct sk_buff *skb; 592 struct sk_buff *skb;
580 struct arphdr *arp; 593 struct arphdr *arp;
581 unsigned char *arp_ptr; 594 unsigned char *arp_ptr;
582 int hlen = LL_RESERVED_SPACE(dev);
583 int tlen = dev->needed_tailroom;
584 595
585 /* 596 /*
586 * Allocate a buffer 597 * Allocate a buffer
587 */ 598 */
588 599
589 skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); 600 skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
590 if (skb == NULL) 601 if (skb == NULL)
591 return NULL; 602 return NULL;
592 603
593 skb_reserve(skb, hlen); 604 skb_reserve(skb, LL_RESERVED_SPACE(dev));
594 skb_reset_network_header(skb); 605 skb_reset_network_header(skb);
595 arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); 606 arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
596 skb->dev = dev; 607 skb->dev = dev;
@@ -622,13 +633,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
622 arp->ar_pro = htons(ETH_P_IP); 633 arp->ar_pro = htons(ETH_P_IP);
623 break; 634 break;
624 635
625#if IS_ENABLED(CONFIG_AX25) 636#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
626 case ARPHRD_AX25: 637 case ARPHRD_AX25:
627 arp->ar_hrd = htons(ARPHRD_AX25); 638 arp->ar_hrd = htons(ARPHRD_AX25);
628 arp->ar_pro = htons(AX25_P_IP); 639 arp->ar_pro = htons(AX25_P_IP);
629 break; 640 break;
630 641
631#if IS_ENABLED(CONFIG_NETROM) 642#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
632 case ARPHRD_NETROM: 643 case ARPHRD_NETROM:
633 arp->ar_hrd = htons(ARPHRD_NETROM); 644 arp->ar_hrd = htons(ARPHRD_NETROM);
634 arp->ar_pro = htons(AX25_P_IP); 645 arp->ar_pro = htons(AX25_P_IP);
@@ -636,12 +647,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
636#endif 647#endif
637#endif 648#endif
638 649
639#if IS_ENABLED(CONFIG_FDDI) 650#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
640 case ARPHRD_FDDI: 651 case ARPHRD_FDDI:
641 arp->ar_hrd = htons(ARPHRD_ETHER); 652 arp->ar_hrd = htons(ARPHRD_ETHER);
642 arp->ar_pro = htons(ETH_P_IP); 653 arp->ar_pro = htons(ETH_P_IP);
643 break; 654 break;
644#endif 655#endif
656#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
657 case ARPHRD_IEEE802_TR:
658 arp->ar_hrd = htons(ARPHRD_IEEE802);
659 arp->ar_pro = htons(ETH_P_IP);
660 break;
661#endif
645 } 662 }
646 663
647 arp->ar_hln = dev->addr_len; 664 arp->ar_hln = dev->addr_len;
@@ -739,10 +756,11 @@ static int arp_process(struct sk_buff *skb)
739 goto out; 756 goto out;
740 break; 757 break;
741 case ARPHRD_ETHER: 758 case ARPHRD_ETHER:
759 case ARPHRD_IEEE802_TR:
742 case ARPHRD_FDDI: 760 case ARPHRD_FDDI:
743 case ARPHRD_IEEE802: 761 case ARPHRD_IEEE802:
744 /* 762 /*
745 * ETHERNET, and Fibre Channel (which are IEEE 802 763 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
746 * devices, according to RFC 2625) devices will accept ARP 764 * devices, according to RFC 2625) devices will accept ARP
747 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 765 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
748 * This is the case also of FDDI, where the RFC 1390 says that 766 * This is the case also of FDDI, where the RFC 1390 says that
@@ -787,8 +805,7 @@ static int arp_process(struct sk_buff *skb)
787 * Check for bad requests for 127.x.x.x and requests for multicast 805 * Check for bad requests for 127.x.x.x and requests for multicast
788 * addresses. If this is one such, delete it. 806 * addresses. If this is one such, delete it.
789 */ 807 */
790 if (ipv4_is_multicast(tip) || 808 if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
791 (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
792 goto out; 809 goto out;
793 810
794/* 811/*
@@ -850,8 +867,7 @@ static int arp_process(struct sk_buff *skb)
850 if (addr_type == RTN_UNICAST && 867 if (addr_type == RTN_UNICAST &&
851 (arp_fwd_proxy(in_dev, dev, rt) || 868 (arp_fwd_proxy(in_dev, dev, rt) ||
852 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || 869 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
853 (rt->dst.dev != dev && 870 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
854 pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
855 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 871 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
856 if (n) 872 if (n)
857 neigh_release(n); 873 neigh_release(n);
@@ -876,7 +892,7 @@ static int arp_process(struct sk_buff *skb)
876 892
877 n = __neigh_lookup(&arp_tbl, &sip, dev, 0); 893 n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
878 894
879 if (IN_DEV_ARP_ACCEPT(in_dev)) { 895 if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) {
880 /* Unsolicited ARP is not accepted by default. 896 /* Unsolicited ARP is not accepted by default.
881 It is possible, that this option should be enabled for some 897 It is possible, that this option should be enabled for some
882 devices (strip is candidate) 898 devices (strip is candidate)
@@ -1024,7 +1040,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1024 return -EINVAL; 1040 return -EINVAL;
1025 } 1041 }
1026 switch (dev->type) { 1042 switch (dev->type) {
1027#if IS_ENABLED(CONFIG_FDDI) 1043#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
1028 case ARPHRD_FDDI: 1044 case ARPHRD_FDDI:
1029 /* 1045 /*
1030 * According to RFC 1390, FDDI devices should accept ARP 1046 * According to RFC 1390, FDDI devices should accept ARP
@@ -1047,7 +1063,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1047 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); 1063 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
1048 err = PTR_ERR(neigh); 1064 err = PTR_ERR(neigh);
1049 if (!IS_ERR(neigh)) { 1065 if (!IS_ERR(neigh)) {
1050 unsigned int state = NUD_STALE; 1066 unsigned state = NUD_STALE;
1051 if (r->arp_flags & ATF_PERM) 1067 if (r->arp_flags & ATF_PERM)
1052 state = NUD_PERMANENT; 1068 state = NUD_PERMANENT;
1053 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? 1069 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1059,7 +1075,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1059 return err; 1075 return err;
1060} 1076}
1061 1077
1062static unsigned int arp_state_to_flags(struct neighbour *neigh) 1078static unsigned arp_state_to_flags(struct neighbour *neigh)
1063{ 1079{
1064 if (neigh->nud_state&NUD_PERMANENT) 1080 if (neigh->nud_state&NUD_PERMANENT)
1065 return ATF_PERM | ATF_COM; 1081 return ATF_PERM | ATF_COM;
@@ -1159,7 +1175,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1159 switch (cmd) { 1175 switch (cmd) {
1160 case SIOCDARP: 1176 case SIOCDARP:
1161 case SIOCSARP: 1177 case SIOCSARP:
1162 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1178 if (!capable(CAP_NET_ADMIN))
1163 return -EPERM; 1179 return -EPERM;
1164 case SIOCGARP: 1180 case SIOCGARP:
1165 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1181 err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -1223,7 +1239,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1223 switch (event) { 1239 switch (event) {
1224 case NETDEV_CHANGEADDR: 1240 case NETDEV_CHANGEADDR:
1225 neigh_changeaddr(&arp_tbl, dev); 1241 neigh_changeaddr(&arp_tbl, dev);
1226 rt_cache_flush(dev_net(dev)); 1242 rt_cache_flush(dev_net(dev), 0);
1227 break; 1243 break;
1228 default: 1244 default:
1229 break; 1245 break;
@@ -1270,7 +1286,7 @@ void __init arp_init(void)
1270} 1286}
1271 1287
1272#ifdef CONFIG_PROC_FS 1288#ifdef CONFIG_PROC_FS
1273#if IS_ENABLED(CONFIG_AX25) 1289#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1274 1290
1275/* ------------------------------------------------------------------------ */ 1291/* ------------------------------------------------------------------------ */
1276/* 1292/*
@@ -1318,7 +1334,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1318 1334
1319 read_lock(&n->lock); 1335 read_lock(&n->lock);
1320 /* Convert hardware address to XX:XX:XX:XX ... form. */ 1336 /* Convert hardware address to XX:XX:XX:XX ... form. */
1321#if IS_ENABLED(CONFIG_AX25) 1337#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1322 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) 1338 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
1323 ax2asc2((ax25_address *)n->ha, hbuffer); 1339 ax2asc2((ax25_address *)n->ha, hbuffer);
1324 else { 1340 else {
@@ -1331,7 +1347,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1331 if (k != 0) 1347 if (k != 0)
1332 --k; 1348 --k;
1333 hbuffer[k] = 0; 1349 hbuffer[k] = 0;
1334#if IS_ENABLED(CONFIG_AX25) 1350#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1335 } 1351 }
1336#endif 1352#endif
1337 sprintf(tbuf, "%pI4", n->primary_key); 1353 sprintf(tbuf, "%pI4", n->primary_key);
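The arp.c hunks above repeatedly swap IS_ENABLED(CONFIG_FOO) for the older defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE) spelling. A minimal sketch of why the two tests are interchangeable for a tristate option, assuming the usual Kconfig convention that CONFIG_FOO_MODULE is defined for =m builds (an illustration, not the kernel's include/linux/kconfig.h implementation):

/* Equivalent checks for "AX25 support is built in or modular". */
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)   /* older spelling */
#define HAVE_AX25 1
#else
#define HAVE_AX25 0
#endif

/* Newer kernels write the same test as IS_ENABLED(CONFIG_AX25),
 * which can also be used in ordinary C expressions, e.g.
 * if (IS_ENABLED(CONFIG_AX25)) { ... }
 */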
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 667c1d4ca98..2c2a98e402e 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -476,7 +476,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
476 doi = doi_def->doi; 476 doi = doi_def->doi;
477 doi_type = doi_def->type; 477 doi_type = doi_def->type;
478 478
479 if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) 479 if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
480 goto doi_add_return; 480 goto doi_add_return;
481 for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { 481 for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
482 switch (doi_def->tags[iter]) { 482 switch (doi_def->tags[iter]) {
@@ -1725,10 +1725,8 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
1725 case CIPSO_V4_TAG_LOCAL: 1725 case CIPSO_V4_TAG_LOCAL:
1726 /* This is a non-standard tag that we only allow for 1726 /* This is a non-standard tag that we only allow for
1727 * local connections, so if the incoming interface is 1727 * local connections, so if the incoming interface is
1728 * not the loopback device drop the packet. Further, 1728 * not the loopback device drop the packet. */
1729 * there is no legitimate reason for setting this from 1729 if (!(skb->dev->flags & IFF_LOOPBACK)) {
1730 * userspace so reject it if skb is NULL. */
1731 if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
1732 err_offset = opt_iter; 1730 err_offset = opt_iter;
1733 goto validate_return_locked; 1731 goto validate_return_locked;
1734 } 1732 }
@@ -1859,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
1859 return CIPSO_V4_HDR_LEN + ret_val; 1857 return CIPSO_V4_HDR_LEN + ret_val;
1860} 1858}
1861 1859
1860static void opt_kfree_rcu(struct rcu_head *head)
1861{
1862 kfree(container_of(head, struct ip_options_rcu, rcu));
1863}
1864
1862/** 1865/**
1863 * cipso_v4_sock_setattr - Add a CIPSO option to a socket 1866 * cipso_v4_sock_setattr - Add a CIPSO option to a socket
1864 * @sk: the socket 1867 * @sk: the socket
@@ -1935,7 +1938,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
1935 } 1938 }
1936 rcu_assign_pointer(sk_inet->inet_opt, opt); 1939 rcu_assign_pointer(sk_inet->inet_opt, opt);
1937 if (old) 1940 if (old)
1938 kfree_rcu(old, rcu); 1941 call_rcu(&old->rcu, opt_kfree_rcu);
1939 1942
1940 return 0; 1943 return 0;
1941 1944
@@ -2002,7 +2005,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
2002 req_inet = inet_rsk(req); 2005 req_inet = inet_rsk(req);
2003 opt = xchg(&req_inet->opt, opt); 2006 opt = xchg(&req_inet->opt, opt);
2004 if (opt) 2007 if (opt)
2005 kfree_rcu(opt, rcu); 2008 call_rcu(&opt->rcu, opt_kfree_rcu);
2006 2009
2007 return 0; 2010 return 0;
2008 2011
@@ -2072,7 +2075,7 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
2072 * remove the entire option struct */ 2075 * remove the entire option struct */
2073 *opt_ptr = NULL; 2076 *opt_ptr = NULL;
2074 hdr_delta = opt->opt.optlen; 2077 hdr_delta = opt->opt.optlen;
2075 kfree_rcu(opt, rcu); 2078 call_rcu(&opt->rcu, opt_kfree_rcu);
2076 } 2079 }
2077 2080
2078 return hdr_delta; 2081 return hdr_delta;
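The cipso_ipv4.c hunks replace kfree_rcu() with an explicit call_rcu() callback, opt_kfree_rcu(), which recovers the enclosing ip_options_rcu from its embedded rcu_head. A condensed sketch of that deferred-free pattern (the _example name is illustrative, not a kernel symbol):

/* Free the enclosing object once all RCU readers are done with it. */
static void opt_kfree_rcu_example(struct rcu_head *head)
{
	kfree(container_of(head, struct ip_options_rcu, rcu));
}

/* Caller, after unpublishing the old pointer via rcu_assign_pointer()
 * or xchg():
 *
 *	if (old)
 *		call_rcu(&old->rcu, opt_kfree_rcu_example);
 */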
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a8e4f2665d5..76db59202f1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -27,6 +27,7 @@
27 27
28 28
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <asm/system.h>
30#include <linux/bitops.h> 31#include <linux/bitops.h>
31#include <linux/capability.h> 32#include <linux/capability.h>
32#include <linux/module.h> 33#include <linux/module.h>
@@ -55,10 +56,10 @@
55#include <linux/sysctl.h> 56#include <linux/sysctl.h>
56#endif 57#endif
57#include <linux/kmod.h> 58#include <linux/kmod.h>
58#include <linux/netconf.h>
59 59
60#include <net/arp.h> 60#include <net/arp.h>
61#include <net/ip.h> 61#include <net/ip.h>
62#include <net/tcp.h>
62#include <net/route.h> 63#include <net/route.h>
63#include <net/ip_fib.h> 64#include <net/ip_fib.h>
64#include <net/rtnetlink.h> 65#include <net/rtnetlink.h>
@@ -95,22 +96,25 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
95 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, 96 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
96}; 97};
97 98
98#define IN4_ADDR_HSIZE_SHIFT 8 99/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
99#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) 100 * value. So if you change this define, make appropriate changes to
100 101 * inet_addr_hash as well.
102 */
103#define IN4_ADDR_HSIZE 256
101static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; 104static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
102static DEFINE_SPINLOCK(inet_addr_hash_lock); 105static DEFINE_SPINLOCK(inet_addr_hash_lock);
103 106
104static u32 inet_addr_hash(struct net *net, __be32 addr) 107static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
105{ 108{
106 u32 val = (__force u32) addr ^ net_hash_mix(net); 109 u32 val = (__force u32) addr ^ hash_ptr(net, 8);
107 110
108 return hash_32(val, IN4_ADDR_HSIZE_SHIFT); 111 return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
112 (IN4_ADDR_HSIZE - 1));
109} 113}
110 114
111static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) 115static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
112{ 116{
113 u32 hash = inet_addr_hash(net, ifa->ifa_local); 117 unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
114 118
115 spin_lock(&inet_addr_hash_lock); 119 spin_lock(&inet_addr_hash_lock);
116 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); 120 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
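Within this hunk the hash function itself changes: the hash_32()-based mix over IN4_ADDR_HSIZE_SHIFT bits is replaced by the older byte-folding XOR masked to the 256-entry table. A side-by-side sketch of the two variants as they appear above (illustrative helper names; both assume the 256-bucket inet_addr_lst):

#define IN4_ADDR_HSIZE_SHIFT	8
#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)	/* 256 */

/* newer scheme: multiplicative hash_32() keeps the high bits */
static inline u32 addr_hash_new(u32 val)
{
	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
}

/* older scheme restored above: XOR-fold the four bytes, then mask */
static inline u32 addr_hash_old(u32 val)
{
	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
	       (IN4_ADDR_HSIZE - 1);
}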
@@ -134,18 +138,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
134 */ 138 */
135struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) 139struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
136{ 140{
137 u32 hash = inet_addr_hash(net, addr); 141 unsigned int hash = inet_addr_hash(net, addr);
138 struct net_device *result = NULL; 142 struct net_device *result = NULL;
139 struct in_ifaddr *ifa; 143 struct in_ifaddr *ifa;
140 struct hlist_node *node; 144 struct hlist_node *node;
141 145
142 rcu_read_lock(); 146 rcu_read_lock();
143 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { 147 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
144 if (ifa->ifa_local == addr) { 148 struct net_device *dev = ifa->ifa_dev->dev;
145 struct net_device *dev = ifa->ifa_dev->dev;
146 149
147 if (!net_eq(dev_net(dev), net)) 150 if (!net_eq(dev_net(dev), net))
148 continue; 151 continue;
152 if (ifa->ifa_local == addr) {
149 result = dev; 153 result = dev;
150 break; 154 break;
151 } 155 }
@@ -180,10 +184,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
180static void devinet_sysctl_register(struct in_device *idev); 184static void devinet_sysctl_register(struct in_device *idev);
181static void devinet_sysctl_unregister(struct in_device *idev); 185static void devinet_sysctl_unregister(struct in_device *idev);
182#else 186#else
183static void devinet_sysctl_register(struct in_device *idev) 187static inline void devinet_sysctl_register(struct in_device *idev)
184{ 188{
185} 189}
186static void devinet_sysctl_unregister(struct in_device *idev) 190static inline void devinet_sysctl_unregister(struct in_device *idev)
187{ 191{
188} 192}
189#endif 193#endif
@@ -203,7 +207,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
203 kfree(ifa); 207 kfree(ifa);
204} 208}
205 209
206static void inet_free_ifa(struct in_ifaddr *ifa) 210static inline void inet_free_ifa(struct in_ifaddr *ifa)
207{ 211{
208 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); 212 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
209} 213}
@@ -215,7 +219,8 @@ void in_dev_finish_destroy(struct in_device *idev)
215 WARN_ON(idev->ifa_list); 219 WARN_ON(idev->ifa_list);
216 WARN_ON(idev->mc_list); 220 WARN_ON(idev->mc_list);
217#ifdef NET_REFCNT_DEBUG 221#ifdef NET_REFCNT_DEBUG
218 pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); 222 printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
223 idev, dev ? dev->name : "NIL");
219#endif 224#endif
220 dev_put(dev); 225 dev_put(dev);
221 if (!idev->dead) 226 if (!idev->dead)
@@ -287,7 +292,7 @@ static void inetdev_destroy(struct in_device *in_dev)
287 inet_free_ifa(ifa); 292 inet_free_ifa(ifa);
288 } 293 }
289 294
290 RCU_INIT_POINTER(dev->ip_ptr, NULL); 295 rcu_assign_pointer(dev->ip_ptr, NULL);
291 296
292 devinet_sysctl_unregister(in_dev); 297 devinet_sysctl_unregister(in_dev);
293 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 298 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -312,7 +317,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
312} 317}
313 318
314static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, 319static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
315 int destroy, struct nlmsghdr *nlh, u32 portid) 320 int destroy, struct nlmsghdr *nlh, u32 pid)
316{ 321{
317 struct in_ifaddr *promote = NULL; 322 struct in_ifaddr *promote = NULL;
318 struct in_ifaddr *ifa, *ifa1 = *ifap; 323 struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -346,7 +351,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
346 inet_hash_remove(ifa); 351 inet_hash_remove(ifa);
347 *ifap1 = ifa->ifa_next; 352 *ifap1 = ifa->ifa_next;
348 353
349 rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); 354 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
350 blocking_notifier_call_chain(&inetaddr_chain, 355 blocking_notifier_call_chain(&inetaddr_chain,
351 NETDEV_DOWN, ifa); 356 NETDEV_DOWN, ifa);
352 inet_free_ifa(ifa); 357 inet_free_ifa(ifa);
@@ -383,7 +388,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
383 is valid, it will try to restore deleted routes... Grr. 388 is valid, it will try to restore deleted routes... Grr.
384 So that, this order is correct. 389 So that, this order is correct.
385 */ 390 */
386 rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); 391 rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
387 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); 392 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
388 393
389 if (promote) { 394 if (promote) {
@@ -396,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
396 } 401 }
397 402
398 promote->ifa_flags &= ~IFA_F_SECONDARY; 403 promote->ifa_flags &= ~IFA_F_SECONDARY;
399 rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); 404 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
400 blocking_notifier_call_chain(&inetaddr_chain, 405 blocking_notifier_call_chain(&inetaddr_chain,
401 NETDEV_UP, promote); 406 NETDEV_UP, promote);
402 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { 407 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -418,7 +423,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
418} 423}
419 424
420static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, 425static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
421 u32 portid) 426 u32 pid)
422{ 427{
423 struct in_device *in_dev = ifa->ifa_dev; 428 struct in_device *in_dev = ifa->ifa_dev;
424 struct in_ifaddr *ifa1, **ifap, **last_primary; 429 struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -465,7 +470,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
465 /* Send message first, then call notifier. 470 /* Send message first, then call notifier.
466 Notifier will trigger FIB update, so that 471 Notifier will trigger FIB update, so that
467 listeners of netlink will know about new ifaddr */ 472 listeners of netlink will know about new ifaddr */
468 rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); 473 rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
469 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); 474 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
470 475
471 return 0; 476 return 0;
@@ -564,7 +569,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
564 !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) 569 !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
565 continue; 570 continue;
566 571
567 __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); 572 __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
568 return 0; 573 return 0;
569 } 574 }
570 575
@@ -650,14 +655,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
650 if (IS_ERR(ifa)) 655 if (IS_ERR(ifa))
651 return PTR_ERR(ifa); 656 return PTR_ERR(ifa);
652 657
653 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); 658 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
654} 659}
655 660
656/* 661/*
657 * Determine a default network mask, based on the IP address. 662 * Determine a default network mask, based on the IP address.
658 */ 663 */
659 664
660static int inet_abc_len(__be32 addr) 665static inline int inet_abc_len(__be32 addr)
661{ 666{
662 int rc = -1; /* Something else, probably a multicast. */ 667 int rc = -1; /* Something else, probably a multicast. */
663 668
@@ -723,16 +728,17 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
723 break; 728 break;
724 729
725 case SIOCSIFFLAGS: 730 case SIOCSIFFLAGS:
726 ret = -EPERM; 731 ret = -EACCES;
727 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 732 if (!capable(CAP_NET_ADMIN))
728 goto out; 733 goto out;
729 break; 734 break;
730 case SIOCSIFADDR: /* Set interface address (and family) */ 735 case SIOCSIFADDR: /* Set interface address (and family) */
731 case SIOCSIFBRDADDR: /* Set the broadcast address */ 736 case SIOCSIFBRDADDR: /* Set the broadcast address */
732 case SIOCSIFDSTADDR: /* Set the destination address */ 737 case SIOCSIFDSTADDR: /* Set the destination address */
733 case SIOCSIFNETMASK: /* Set the netmask for the interface */ 738 case SIOCSIFNETMASK: /* Set the netmask for the interface */
734 ret = -EPERM; 739 case SIOCKILLADDR: /* Nuke all sockets on this address */
735 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 740 ret = -EACCES;
741 if (!capable(CAP_NET_ADMIN))
736 goto out; 742 goto out;
737 ret = -EINVAL; 743 ret = -EINVAL;
738 if (sin->sin_family != AF_INET) 744 if (sin->sin_family != AF_INET)
@@ -782,7 +788,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
782 } 788 }
783 789
784 ret = -EADDRNOTAVAIL; 790 ret = -EADDRNOTAVAIL;
785 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) 791 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS
792 && cmd != SIOCKILLADDR)
786 goto done; 793 goto done;
787 794
788 switch (cmd) { 795 switch (cmd) {
@@ -823,9 +830,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
823 if (!ifa) { 830 if (!ifa) {
824 ret = -ENOBUFS; 831 ret = -ENOBUFS;
825 ifa = inet_alloc_ifa(); 832 ifa = inet_alloc_ifa();
833 INIT_HLIST_NODE(&ifa->hash);
826 if (!ifa) 834 if (!ifa)
827 break; 835 break;
828 INIT_HLIST_NODE(&ifa->hash);
829 if (colon) 836 if (colon)
830 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); 837 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
831 else 838 else
@@ -908,6 +915,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
908 inet_insert_ifa(ifa); 915 inet_insert_ifa(ifa);
909 } 916 }
910 break; 917 break;
918 case SIOCKILLADDR: /* Nuke all connections on this address */
919 ret = tcp_nuke_addr(net, (struct sockaddr *) sin);
920 break;
911 } 921 }
912done: 922done:
913 rtnl_unlock(); 923 rtnl_unlock();
@@ -1075,7 +1085,6 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
1075 1085
1076 return addr; 1086 return addr;
1077} 1087}
1078EXPORT_SYMBOL(inet_confirm_addr);
1079 1088
1080/* 1089/*
1081 * Device notifier 1090 * Device notifier
@@ -1122,7 +1131,7 @@ skip:
1122 } 1131 }
1123} 1132}
1124 1133
1125static bool inetdev_valid_mtu(unsigned int mtu) 1134static inline bool inetdev_valid_mtu(unsigned mtu)
1126{ 1135{
1127 return mtu >= 68; 1136 return mtu >= 68;
1128} 1137}
@@ -1171,8 +1180,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1171 1180
1172 switch (event) { 1181 switch (event) {
1173 case NETDEV_REGISTER: 1182 case NETDEV_REGISTER:
1174 pr_debug("%s: bug\n", __func__); 1183 printk(KERN_DEBUG "inetdev_event: bug\n");
1175 RCU_INIT_POINTER(dev->ip_ptr, NULL); 1184 rcu_assign_pointer(dev->ip_ptr, NULL);
1176 break; 1185 break;
1177 case NETDEV_UP: 1186 case NETDEV_UP:
1178 if (!inetdev_valid_mtu(dev->mtu)) 1187 if (!inetdev_valid_mtu(dev->mtu))
@@ -1237,7 +1246,7 @@ static struct notifier_block ip_netdev_notifier = {
1237 .notifier_call = inetdev_event, 1246 .notifier_call = inetdev_event,
1238}; 1247};
1239 1248
1240static size_t inet_nlmsg_size(void) 1249static inline size_t inet_nlmsg_size(void)
1241{ 1250{
1242 return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) 1251 return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
1243 + nla_total_size(4) /* IFA_ADDRESS */ 1252 + nla_total_size(4) /* IFA_ADDRESS */
@@ -1247,12 +1256,12 @@ static size_t inet_nlmsg_size(void)
1247} 1256}
1248 1257
1249static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1258static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1250 u32 portid, u32 seq, int event, unsigned int flags) 1259 u32 pid, u32 seq, int event, unsigned int flags)
1251{ 1260{
1252 struct ifaddrmsg *ifm; 1261 struct ifaddrmsg *ifm;
1253 struct nlmsghdr *nlh; 1262 struct nlmsghdr *nlh;
1254 1263
1255 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); 1264 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
1256 if (nlh == NULL) 1265 if (nlh == NULL)
1257 return -EMSGSIZE; 1266 return -EMSGSIZE;
1258 1267
@@ -1263,15 +1272,17 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1263 ifm->ifa_scope = ifa->ifa_scope; 1272 ifm->ifa_scope = ifa->ifa_scope;
1264 ifm->ifa_index = ifa->ifa_dev->dev->ifindex; 1273 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1265 1274
1266 if ((ifa->ifa_address && 1275 if (ifa->ifa_address)
1267 nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || 1276 NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
1268 (ifa->ifa_local && 1277
1269 nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || 1278 if (ifa->ifa_local)
1270 (ifa->ifa_broadcast && 1279 NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
1271 nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || 1280
1272 (ifa->ifa_label[0] && 1281 if (ifa->ifa_broadcast)
1273 nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) 1282 NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
1274 goto nla_put_failure; 1283
1284 if (ifa->ifa_label[0])
1285 NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
1275 1286
1276 return nlmsg_end(skb, nlh); 1287 return nlmsg_end(skb, nlh);
1277 1288
@@ -1314,7 +1325,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1314 if (ip_idx < s_ip_idx) 1325 if (ip_idx < s_ip_idx)
1315 continue; 1326 continue;
1316 if (inet_fill_ifaddr(skb, ifa, 1327 if (inet_fill_ifaddr(skb, ifa,
1317 NETLINK_CB(cb->skb).portid, 1328 NETLINK_CB(cb->skb).pid,
1318 cb->nlh->nlmsg_seq, 1329 cb->nlh->nlmsg_seq,
1319 RTM_NEWADDR, NLM_F_MULTI) <= 0) { 1330 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1320 rcu_read_unlock(); 1331 rcu_read_unlock();
@@ -1336,7 +1347,7 @@ done:
1336} 1347}
1337 1348
1338static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, 1349static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1339 u32 portid) 1350 u32 pid)
1340{ 1351{
1341 struct sk_buff *skb; 1352 struct sk_buff *skb;
1342 u32 seq = nlh ? nlh->nlmsg_seq : 0; 1353 u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1348,14 +1359,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1348 if (skb == NULL) 1359 if (skb == NULL)
1349 goto errout; 1360 goto errout;
1350 1361
1351 err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); 1362 err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
1352 if (err < 0) { 1363 if (err < 0) {
1353 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ 1364 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
1354 WARN_ON(err == -EMSGSIZE); 1365 WARN_ON(err == -EMSGSIZE);
1355 kfree_skb(skb); 1366 kfree_skb(skb);
1356 goto errout; 1367 goto errout;
1357 } 1368 }
1358 rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); 1369 rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
1359 return; 1370 return;
1360errout: 1371errout:
1361 if (err < 0) 1372 if (err < 0)
@@ -1443,155 +1454,6 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1443 return 0; 1454 return 0;
1444} 1455}
1445 1456
1446static int inet_netconf_msgsize_devconf(int type)
1447{
1448 int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
1449 + nla_total_size(4); /* NETCONFA_IFINDEX */
1450
1451 /* type -1 is used for ALL */
1452 if (type == -1 || type == NETCONFA_FORWARDING)
1453 size += nla_total_size(4);
1454 if (type == -1 || type == NETCONFA_RP_FILTER)
1455 size += nla_total_size(4);
1456 if (type == -1 || type == NETCONFA_MC_FORWARDING)
1457 size += nla_total_size(4);
1458
1459 return size;
1460}
1461
1462static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
1463 struct ipv4_devconf *devconf, u32 portid,
1464 u32 seq, int event, unsigned int flags,
1465 int type)
1466{
1467 struct nlmsghdr *nlh;
1468 struct netconfmsg *ncm;
1469
1470 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
1471 flags);
1472 if (nlh == NULL)
1473 return -EMSGSIZE;
1474
1475 ncm = nlmsg_data(nlh);
1476 ncm->ncm_family = AF_INET;
1477
1478 if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
1479 goto nla_put_failure;
1480
1481 /* type -1 is used for ALL */
1482 if ((type == -1 || type == NETCONFA_FORWARDING) &&
1483 nla_put_s32(skb, NETCONFA_FORWARDING,
1484 IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
1485 goto nla_put_failure;
1486 if ((type == -1 || type == NETCONFA_RP_FILTER) &&
1487 nla_put_s32(skb, NETCONFA_RP_FILTER,
1488 IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
1489 goto nla_put_failure;
1490 if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
1491 nla_put_s32(skb, NETCONFA_MC_FORWARDING,
1492 IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
1493 goto nla_put_failure;
1494
1495 return nlmsg_end(skb, nlh);
1496
1497nla_put_failure:
1498 nlmsg_cancel(skb, nlh);
1499 return -EMSGSIZE;
1500}
1501
1502void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
1503 struct ipv4_devconf *devconf)
1504{
1505 struct sk_buff *skb;
1506 int err = -ENOBUFS;
1507
1508 skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
1509 if (skb == NULL)
1510 goto errout;
1511
1512 err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
1513 RTM_NEWNETCONF, 0, type);
1514 if (err < 0) {
1515 /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
1516 WARN_ON(err == -EMSGSIZE);
1517 kfree_skb(skb);
1518 goto errout;
1519 }
1520 rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
1521 return;
1522errout:
1523 if (err < 0)
1524 rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
1525}
1526
1527static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
1528 [NETCONFA_IFINDEX] = { .len = sizeof(int) },
1529 [NETCONFA_FORWARDING] = { .len = sizeof(int) },
1530 [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
1531};
1532
1533static int inet_netconf_get_devconf(struct sk_buff *in_skb,
1534 struct nlmsghdr *nlh,
1535 void *arg)
1536{
1537 struct net *net = sock_net(in_skb->sk);
1538 struct nlattr *tb[NETCONFA_MAX+1];
1539 struct netconfmsg *ncm;
1540 struct sk_buff *skb;
1541 struct ipv4_devconf *devconf;
1542 struct in_device *in_dev;
1543 struct net_device *dev;
1544 int ifindex;
1545 int err;
1546
1547 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
1548 devconf_ipv4_policy);
1549 if (err < 0)
1550 goto errout;
1551
1552 err = EINVAL;
1553 if (!tb[NETCONFA_IFINDEX])
1554 goto errout;
1555
1556 ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
1557 switch (ifindex) {
1558 case NETCONFA_IFINDEX_ALL:
1559 devconf = net->ipv4.devconf_all;
1560 break;
1561 case NETCONFA_IFINDEX_DEFAULT:
1562 devconf = net->ipv4.devconf_dflt;
1563 break;
1564 default:
1565 dev = __dev_get_by_index(net, ifindex);
1566 if (dev == NULL)
1567 goto errout;
1568 in_dev = __in_dev_get_rtnl(dev);
1569 if (in_dev == NULL)
1570 goto errout;
1571 devconf = &in_dev->cnf;
1572 break;
1573 }
1574
1575 err = -ENOBUFS;
1576 skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
1577 if (skb == NULL)
1578 goto errout;
1579
1580 err = inet_netconf_fill_devconf(skb, ifindex, devconf,
1581 NETLINK_CB(in_skb).portid,
1582 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
1583 -1);
1584 if (err < 0) {
1585 /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
1586 WARN_ON(err == -EMSGSIZE);
1587 kfree_skb(skb);
1588 goto errout;
1589 }
1590 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1591errout:
1592 return err;
1593}
1594
1595#ifdef CONFIG_SYSCTL 1457#ifdef CONFIG_SYSCTL
1596 1458
1597static void devinet_copy_dflt_conf(struct net *net, int i) 1459static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1617,12 +1479,6 @@ static void inet_forward_change(struct net *net)
1617 1479
1618 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; 1480 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
1619 IPV4_DEVCONF_DFLT(net, FORWARDING) = on; 1481 IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
1620 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1621 NETCONFA_IFINDEX_ALL,
1622 net->ipv4.devconf_all);
1623 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1624 NETCONFA_IFINDEX_DEFAULT,
1625 net->ipv4.devconf_dflt);
1626 1482
1627 for_each_netdev(net, dev) { 1483 for_each_netdev(net, dev) {
1628 struct in_device *in_dev; 1484 struct in_device *in_dev;
@@ -1630,11 +1486,8 @@ static void inet_forward_change(struct net *net)
1630 dev_disable_lro(dev); 1486 dev_disable_lro(dev);
1631 rcu_read_lock(); 1487 rcu_read_lock();
1632 in_dev = __in_dev_get_rcu(dev); 1488 in_dev = __in_dev_get_rcu(dev);
1633 if (in_dev) { 1489 if (in_dev)
1634 IN_DEV_CONF_SET(in_dev, FORWARDING, on); 1490 IN_DEV_CONF_SET(in_dev, FORWARDING, on);
1635 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1636 dev->ifindex, &in_dev->cnf);
1637 }
1638 rcu_read_unlock(); 1491 rcu_read_unlock();
1639 } 1492 }
1640} 1493}
@@ -1656,27 +1509,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1656 1509
1657 if (cnf == net->ipv4.devconf_dflt) 1510 if (cnf == net->ipv4.devconf_dflt)
1658 devinet_copy_dflt_conf(net, i); 1511 devinet_copy_dflt_conf(net, i);
1659 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || 1512 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
1660 i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
1661 if ((new_value == 0) && (old_value != 0)) 1513 if ((new_value == 0) && (old_value != 0))
1662 rt_cache_flush(net); 1514 rt_cache_flush(net, 0);
1663 if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
1664 new_value != old_value) {
1665 int ifindex;
1666
1667 if (cnf == net->ipv4.devconf_dflt)
1668 ifindex = NETCONFA_IFINDEX_DEFAULT;
1669 else if (cnf == net->ipv4.devconf_all)
1670 ifindex = NETCONFA_IFINDEX_ALL;
1671 else {
1672 struct in_device *idev =
1673 container_of(cnf, struct in_device,
1674 cnf);
1675 ifindex = idev->dev->ifindex;
1676 }
1677 inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
1678 ifindex, cnf);
1679 }
1680 } 1515 }
1681 1516
1682 return ret; 1517 return ret;
@@ -1703,23 +1538,15 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1703 } 1538 }
1704 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { 1539 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
1705 inet_forward_change(net); 1540 inet_forward_change(net);
1706 } else { 1541 } else if (*valp) {
1707 struct ipv4_devconf *cnf = ctl->extra1; 1542 struct ipv4_devconf *cnf = ctl->extra1;
1708 struct in_device *idev = 1543 struct in_device *idev =
1709 container_of(cnf, struct in_device, cnf); 1544 container_of(cnf, struct in_device, cnf);
1710 if (*valp) 1545 dev_disable_lro(idev->dev);
1711 dev_disable_lro(idev->dev);
1712 inet_netconf_notify_devconf(net,
1713 NETCONFA_FORWARDING,
1714 idev->dev->ifindex,
1715 cnf);
1716 } 1546 }
1717 rtnl_unlock(); 1547 rtnl_unlock();
1718 rt_cache_flush(net); 1548 rt_cache_flush(net, 0);
1719 } else 1549 }
1720 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1721 NETCONFA_IFINDEX_DEFAULT,
1722 net->ipv4.devconf_dflt);
1723 } 1550 }
1724 1551
1725 return ret; 1552 return ret;
@@ -1735,7 +1562,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1735 struct net *net = ctl->extra2; 1562 struct net *net = ctl->extra2;
1736 1563
1737 if (write && *valp != val) 1564 if (write && *valp != val)
1738 rt_cache_flush(net); 1565 rt_cache_flush(net, 0);
1739 1566
1740 return ret; 1567 return ret;
1741} 1568}
@@ -1766,6 +1593,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1766static struct devinet_sysctl_table { 1593static struct devinet_sysctl_table {
1767 struct ctl_table_header *sysctl_header; 1594 struct ctl_table_header *sysctl_header;
1768 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; 1595 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
1596 char *dev_name;
1769} devinet_sysctl = { 1597} devinet_sysctl = {
1770 .devinet_vars = { 1598 .devinet_vars = {
1771 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 1599 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1799,8 +1627,6 @@ static struct devinet_sysctl_table {
1799 "force_igmp_version"), 1627 "force_igmp_version"),
1800 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, 1628 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
1801 "promote_secondaries"), 1629 "promote_secondaries"),
1802 DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
1803 "route_localnet"),
1804 }, 1630 },
1805}; 1631};
1806 1632
@@ -1809,7 +1635,16 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1809{ 1635{
1810 int i; 1636 int i;
1811 struct devinet_sysctl_table *t; 1637 struct devinet_sysctl_table *t;
1812 char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; 1638
1639#define DEVINET_CTL_PATH_DEV 3
1640
1641 struct ctl_path devinet_ctl_path[] = {
1642 { .procname = "net", },
1643 { .procname = "ipv4", },
1644 { .procname = "conf", },
1645 { /* to be set */ },
1646 { },
1647 };
1813 1648
1814 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); 1649 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
1815 if (!t) 1650 if (!t)
@@ -1821,15 +1656,27 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1821 t->devinet_vars[i].extra2 = net; 1656 t->devinet_vars[i].extra2 = net;
1822 } 1657 }
1823 1658
1824 snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); 1659 /*
1660 * Make a copy of dev_name, because '.procname' is regarded as const
1661 * by sysctl and we wouldn't want anyone to change it under our feet
1662 * (see SIOCSIFNAME).
1663 */
1664 t->dev_name = kstrdup(dev_name, GFP_KERNEL);
1665 if (!t->dev_name)
1666 goto free;
1667
1668 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
1825 1669
1826 t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); 1670 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
1671 t->devinet_vars);
1827 if (!t->sysctl_header) 1672 if (!t->sysctl_header)
1828 goto free; 1673 goto free_procname;
1829 1674
1830 p->sysctl = t; 1675 p->sysctl = t;
1831 return 0; 1676 return 0;
1832 1677
1678free_procname:
1679 kfree(t->dev_name);
1833free: 1680free:
1834 kfree(t); 1681 kfree(t);
1835out: 1682out:
@@ -1845,6 +1692,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1845 1692
1846 cnf->sysctl = NULL; 1693 cnf->sysctl = NULL;
1847 unregister_net_sysctl_table(t->sysctl_header); 1694 unregister_net_sysctl_table(t->sysctl_header);
1695 kfree(t->dev_name);
1848 kfree(t); 1696 kfree(t);
1849} 1697}
1850 1698
@@ -1874,6 +1722,12 @@ static struct ctl_table ctl_forward_entry[] = {
1874 }, 1722 },
1875 { }, 1723 { },
1876}; 1724};
1725
1726static __net_initdata struct ctl_path net_ipv4_path[] = {
1727 { .procname = "net", },
1728 { .procname = "ipv4", },
1729 { },
1730};
1877#endif 1731#endif
1878 1732
1879static __net_init int devinet_init_net(struct net *net) 1733static __net_init int devinet_init_net(struct net *net)
@@ -1919,7 +1773,7 @@ static __net_init int devinet_init_net(struct net *net)
1919 goto err_reg_dflt; 1773 goto err_reg_dflt;
1920 1774
1921 err = -ENOMEM; 1775 err = -ENOMEM;
1922 forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); 1776 forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
1923 if (forw_hdr == NULL) 1777 if (forw_hdr == NULL)
1924 goto err_reg_ctl; 1778 goto err_reg_ctl;
1925 net->ipv4.forw_hdr = forw_hdr; 1779 net->ipv4.forw_hdr = forw_hdr;
@@ -1993,7 +1847,5 @@ void __init devinet_init(void)
1993 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); 1847 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
1994 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); 1848 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
1995 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); 1849 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
1996 rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
1997 NULL, NULL);
1998} 1850}
1999 1851
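devinet.c also reverts the sysctl registration from the single-string register_net_sysctl() call to the older ctl_path-array API, which is why the per-device name has to be kstrdup()'d and kept alongside the table. A sketch of the two styles (the _example identifier is illustrative):

/* older style, as restored above: a ctl_path array whose device
 * component must outlive the registration, hence the kstrdup(). */
static struct ctl_path devinet_ctl_path_example[] = {
	{ .procname = "net",  },
	{ .procname = "ipv4", },
	{ .procname = "conf", },
	{ /* .procname = kstrdup(dev_name, GFP_KERNEL) at runtime */ },
	{ },
};
/* header = register_net_sysctl_table(net, devinet_ctl_path_example, vars); */

/* newer style removed by this patch: one formatted path string
 *	snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
 *	header = register_net_sysctl(net, path, vars);
 */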
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b61e9deb7c7..a5b413416da 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) "IPsec: " fmt
2
3#include <crypto/aead.h> 1#include <crypto/aead.h>
4#include <crypto/authenc.h> 2#include <crypto/authenc.h>
5#include <linux/err.h> 3#include <linux/err.h>
@@ -459,22 +457,28 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
459 struct esp_data *esp = x->data; 457 struct esp_data *esp = x->data;
460 u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); 458 u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
461 u32 align = max_t(u32, blksize, esp->padlen); 459 u32 align = max_t(u32, blksize, esp->padlen);
462 unsigned int net_adj; 460 u32 rem;
461
462 mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
463 rem = mtu & (align - 1);
464 mtu &= ~(align - 1);
463 465
464 switch (x->props.mode) { 466 switch (x->props.mode) {
465 case XFRM_MODE_TRANSPORT:
466 case XFRM_MODE_BEET:
467 net_adj = sizeof(struct iphdr);
468 break;
469 case XFRM_MODE_TUNNEL: 467 case XFRM_MODE_TUNNEL:
470 net_adj = 0;
471 break; 468 break;
472 default: 469 default:
473 BUG(); 470 case XFRM_MODE_TRANSPORT:
471 /* The worst case */
472 mtu -= blksize - 4;
473 mtu += min_t(u32, blksize - 4, rem);
474 break;
475 case XFRM_MODE_BEET:
476 /* The worst case. */
477 mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
478 break;
474 } 479 }
475 480
476 return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - 481 return mtu - 2;
477 net_adj) & ~(align - 1)) + (net_adj - 2);
478} 482}
479 483
480static void esp4_err(struct sk_buff *skb, u32 info) 484static void esp4_err(struct sk_buff *skb, u32 info)
@@ -484,25 +488,16 @@ static void esp4_err(struct sk_buff *skb, u32 info)
484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 488 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
485 struct xfrm_state *x; 489 struct xfrm_state *x;
486 490
487 switch (icmp_hdr(skb)->type) { 491 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
488 case ICMP_DEST_UNREACH: 492 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
489 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
490 return;
491 case ICMP_REDIRECT:
492 break;
493 default:
494 return; 493 return;
495 }
496 494
497 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 495 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
498 esph->spi, IPPROTO_ESP, AF_INET); 496 esph->spi, IPPROTO_ESP, AF_INET);
499 if (!x) 497 if (!x)
500 return; 498 return;
501 499 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
502 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 500 ntohl(esph->spi), ntohl(iph->daddr));
503 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
504 else
505 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
506 xfrm_state_put(x); 501 xfrm_state_put(x);
507} 502}
508 503
@@ -711,11 +706,11 @@ static const struct net_protocol esp4_protocol = {
711static int __init esp4_init(void) 706static int __init esp4_init(void)
712{ 707{
713 if (xfrm_register_type(&esp_type, AF_INET) < 0) { 708 if (xfrm_register_type(&esp_type, AF_INET) < 0) {
714 pr_info("%s: can't add xfrm type\n", __func__); 709 printk(KERN_INFO "ip esp init: can't add xfrm type\n");
715 return -EAGAIN; 710 return -EAGAIN;
716 } 711 }
717 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { 712 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
718 pr_info("%s: can't add protocol\n", __func__); 713 printk(KERN_INFO "ip esp init: can't add protocol\n");
719 xfrm_unregister_type(&esp_type, AF_INET); 714 xfrm_unregister_type(&esp_type, AF_INET);
720 return -EAGAIN; 715 return -EAGAIN;
721 } 716 }
@@ -725,9 +720,9 @@ static int __init esp4_init(void)
725static void __exit esp4_fini(void) 720static void __exit esp4_fini(void)
726{ 721{
727 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) 722 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
728 pr_info("%s: can't remove protocol\n", __func__); 723 printk(KERN_INFO "ip esp close: can't remove protocol\n");
729 if (xfrm_unregister_type(&esp_type, AF_INET) < 0) 724 if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
730 pr_info("%s: can't remove xfrm type\n", __func__); 725 printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
731} 726}
732 727
733module_init(esp4_init); 728module_init(esp4_init);
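The esp4_get_mtu() hunk restores the older alignment arithmetic. A worked example of that computation for transport mode, assuming an illustrative 1500-byte link MTU, a 24-byte ESP header, a 12-byte ICV and a 16-byte cipher block (so align = 16); the numbers only show the flow, real values depend on the SA:

/* Condensed restatement of the restored transport-mode path. */
static u32 esp4_mtu_transport_example(u32 mtu, u32 header_len,
				      u32 icv_len, u32 blksize)
{
	u32 align = blksize;			/* assumes blksize >= padlen */
	u32 rem;

	mtu -= header_len + icv_len;		/* 1500 - 24 - 12 = 1464 */
	rem  = mtu & (align - 1);		/* 1464 & 15      = 8    */
	mtu &= ~(align - 1);			/* round down     = 1456 */
	mtu -= blksize - 4;			/* worst-case pad = 1444 */
	mtu += min_t(u32, blksize - 4, rem);	/* reclaim slack  = 1452 */
	return mtu - 2;				/* pad-len + proto = 1450 */
}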
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5cd75e2dab2..92fc5f69f5d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <asm/system.h>
18#include <linux/bitops.h> 19#include <linux/bitops.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20#include <linux/types.h> 21#include <linux/types.h>
@@ -31,7 +32,6 @@
31#include <linux/if_addr.h> 32#include <linux/if_addr.h>
32#include <linux/if_arp.h> 33#include <linux/if_arp.h>
33#include <linux/skbuff.h> 34#include <linux/skbuff.h>
34#include <linux/cache.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
@@ -86,24 +86,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
86 tb = fib_trie_table(id); 86 tb = fib_trie_table(id);
87 if (!tb) 87 if (!tb)
88 return NULL; 88 return NULL;
89
90 switch (id) {
91 case RT_TABLE_LOCAL:
92 net->ipv4.fib_local = tb;
93 break;
94
95 case RT_TABLE_MAIN:
96 net->ipv4.fib_main = tb;
97 break;
98
99 case RT_TABLE_DEFAULT:
100 net->ipv4.fib_default = tb;
101 break;
102
103 default:
104 break;
105 }
106
107 h = id & (FIB_TABLE_HASHSZ - 1); 89 h = id & (FIB_TABLE_HASHSZ - 1);
108 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); 90 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
109 return tb; 91 return tb;
@@ -148,20 +130,20 @@ static void fib_flush(struct net *net)
148 } 130 }
149 131
150 if (flushed) 132 if (flushed)
151 rt_cache_flush(net); 133 rt_cache_flush(net, -1);
152} 134}
153 135
154/* 136/*
155 * Find address type as if only "dev" was present in the system. If 137 * Find address type as if only "dev" was present in the system. If
156 * on_dev is NULL then all interfaces are taken into consideration. 138 * on_dev is NULL then all interfaces are taken into consideration.
157 */ 139 */
158static inline unsigned int __inet_dev_addr_type(struct net *net, 140static inline unsigned __inet_dev_addr_type(struct net *net,
159 const struct net_device *dev, 141 const struct net_device *dev,
160 __be32 addr) 142 __be32 addr)
161{ 143{
162 struct flowi4 fl4 = { .daddr = addr }; 144 struct flowi4 fl4 = { .daddr = addr };
163 struct fib_result res; 145 struct fib_result res;
164 unsigned int ret = RTN_BROADCAST; 146 unsigned ret = RTN_BROADCAST;
165 struct fib_table *local_table; 147 struct fib_table *local_table;
166 148
167 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) 149 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -169,6 +151,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
169 if (ipv4_is_multicast(addr)) 151 if (ipv4_is_multicast(addr))
170 return RTN_MULTICAST; 152 return RTN_MULTICAST;
171 153
154#ifdef CONFIG_IP_MULTIPLE_TABLES
155 res.r = NULL;
156#endif
157
172 local_table = fib_get_table(net, RT_TABLE_LOCAL); 158 local_table = fib_get_table(net, RT_TABLE_LOCAL);
173 if (local_table) { 159 if (local_table) {
174 ret = RTN_UNICAST; 160 ret = RTN_UNICAST;
@@ -195,44 +181,6 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
195} 181}
196EXPORT_SYMBOL(inet_dev_addr_type); 182EXPORT_SYMBOL(inet_dev_addr_type);
197 183
198__be32 fib_compute_spec_dst(struct sk_buff *skb)
199{
200 struct net_device *dev = skb->dev;
201 struct in_device *in_dev;
202 struct fib_result res;
203 struct rtable *rt;
204 struct flowi4 fl4;
205 struct net *net;
206 int scope;
207
208 rt = skb_rtable(skb);
209 if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
210 RTCF_LOCAL)
211 return ip_hdr(skb)->daddr;
212
213 in_dev = __in_dev_get_rcu(dev);
214 BUG_ON(!in_dev);
215
216 net = dev_net(dev);
217
218 scope = RT_SCOPE_UNIVERSE;
219 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
220 fl4.flowi4_oif = 0;
221 fl4.flowi4_iif = LOOPBACK_IFINDEX;
222 fl4.daddr = ip_hdr(skb)->saddr;
223 fl4.saddr = 0;
224 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
225 fl4.flowi4_scope = scope;
226 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
227 if (!fib_lookup(net, &fl4, &res))
228 return FIB_RES_PREFSRC(net, res);
229 } else {
230 scope = RT_SCOPE_LINK;
231 }
232
233 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
234}
235
236/* Given (packet source, input interface) and optional (dst, oif, tos): 184/* Given (packet source, input interface) and optional (dst, oif, tos):
237 * - (main) check, that source is valid i.e. not broadcast or our local 185 * - (main) check, that source is valid i.e. not broadcast or our local
238 * address. 186 * address.
@@ -241,15 +189,17 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
241 * - check, that packet arrived from expected physical interface. 189 * - check, that packet arrived from expected physical interface.
242 * called with rcu_read_lock() 190 * called with rcu_read_lock()
243 */ 191 */
244static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, 192int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
245 u8 tos, int oif, struct net_device *dev, 193 int oif, struct net_device *dev, __be32 *spec_dst,
246 int rpf, struct in_device *idev, u32 *itag) 194 u32 *itag)
247{ 195{
248 int ret, no_addr, accept_local; 196 struct in_device *in_dev;
249 struct fib_result res;
250 struct flowi4 fl4; 197 struct flowi4 fl4;
251 struct net *net; 198 struct fib_result res;
199 int no_addr, rpf, accept_local;
252 bool dev_match; 200 bool dev_match;
201 int ret;
202 struct net *net;
253 203
254 fl4.flowi4_oif = 0; 204 fl4.flowi4_oif = 0;
255 fl4.flowi4_iif = oif; 205 fl4.flowi4_iif = oif;
@@ -258,10 +208,20 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
258 fl4.flowi4_tos = tos; 208 fl4.flowi4_tos = tos;
259 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 209 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
260 210
261 no_addr = idev->ifa_list == NULL; 211 no_addr = rpf = accept_local = 0;
212 in_dev = __in_dev_get_rcu(dev);
213 if (in_dev) {
214 no_addr = in_dev->ifa_list == NULL;
215
216 /* Ignore rp_filter for packets protected by IPsec. */
217 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
218
219 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
220 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
221 }
262 222
263 accept_local = IN_DEV_ACCEPT_LOCAL(idev); 223 if (in_dev == NULL)
264 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; 224 goto e_inval;
265 225
266 net = dev_net(dev); 226 net = dev_net(dev);
267 if (fib_lookup(net, &fl4, &res)) 227 if (fib_lookup(net, &fl4, &res))
@@ -270,6 +230,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
270 if (res.type != RTN_LOCAL || !accept_local) 230 if (res.type != RTN_LOCAL || !accept_local)
271 goto e_inval; 231 goto e_inval;
272 } 232 }
233 *spec_dst = FIB_RES_PREFSRC(net, res);
273 fib_combine_itag(itag, &res); 234 fib_combine_itag(itag, &res);
274 dev_match = false; 235 dev_match = false;
275 236
@@ -298,14 +259,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
298 259
299 ret = 0; 260 ret = 0;
300 if (fib_lookup(net, &fl4, &res) == 0) { 261 if (fib_lookup(net, &fl4, &res) == 0) {
301 if (res.type == RTN_UNICAST) 262 if (res.type == RTN_UNICAST) {
263 *spec_dst = FIB_RES_PREFSRC(net, res);
302 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 264 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
265 }
303 } 266 }
304 return ret; 267 return ret;
305 268
306last_resort: 269last_resort:
307 if (rpf) 270 if (rpf)
308 goto e_rpf; 271 goto e_rpf;
272 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
309 *itag = 0; 273 *itag = 0;
310 return 0; 274 return 0;
311 275
@@ -315,21 +279,6 @@ e_rpf:
315 return -EXDEV; 279 return -EXDEV;
316} 280}
317 281
318/* Ignore rp_filter for packets protected by IPsec. */
319int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
320 u8 tos, int oif, struct net_device *dev,
321 struct in_device *idev, u32 *itag)
322{
323 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
324
325 if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
326 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
327 *itag = 0;
328 return 0;
329 }
330 return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
331}
332
333static inline __be32 sk_extract_addr(struct sockaddr *addr) 282static inline __be32 sk_extract_addr(struct sockaddr *addr)
334{ 283{
335 return ((struct sockaddr_in *) addr)->sin_addr.s_addr; 284 return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -488,7 +437,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
488 switch (cmd) { 437 switch (cmd) {
489 case SIOCADDRT: /* Add a route */ 438 case SIOCADDRT: /* Add a route */
490 case SIOCDELRT: /* Delete a route */ 439 case SIOCDELRT: /* Delete a route */
491 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 440 if (!capable(CAP_NET_ADMIN))
492 return -EPERM; 441 return -EPERM;
493 442
494 if (copy_from_user(&rt, arg, sizeof(rt))) 443 if (copy_from_user(&rt, arg, sizeof(rt)))
@@ -558,7 +507,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
558 cfg->fc_flags = rtm->rtm_flags; 507 cfg->fc_flags = rtm->rtm_flags;
559 cfg->fc_nlflags = nlh->nlmsg_flags; 508 cfg->fc_nlflags = nlh->nlmsg_flags;
560 509
561 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 510 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
562 cfg->fc_nlinfo.nlh = nlh; 511 cfg->fc_nlinfo.nlh = nlh;
563 cfg->fc_nlinfo.nl_net = net; 512 cfg->fc_nlinfo.nl_net = net;
564 513
@@ -746,7 +695,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
746 if (ifa->ifa_flags & IFA_F_SECONDARY) { 695 if (ifa->ifa_flags & IFA_F_SECONDARY) {
747 prim = inet_ifa_byprefix(in_dev, prefix, mask); 696 prim = inet_ifa_byprefix(in_dev, prefix, mask);
748 if (prim == NULL) { 697 if (prim == NULL) {
749 pr_warn("%s: bug: prim == NULL\n", __func__); 698 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
750 return; 699 return;
751 } 700 }
752 } 701 }
@@ -792,7 +741,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
792#define BRD_OK 2 741#define BRD_OK 2
793#define BRD0_OK 4 742#define BRD0_OK 4
794#define BRD1_OK 8 743#define BRD1_OK 8
795 unsigned int ok = 0; 744 unsigned ok = 0;
796 int subnet = 0; /* Primary network */ 745 int subnet = 0; /* Primary network */
797 int gone = 1; /* Address is missing */ 746 int gone = 1; /* Address is missing */
798 int same_prefsrc = 0; /* Another primary with same IP */ 747 int same_prefsrc = 0; /* Another primary with same IP */
@@ -800,11 +749,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
800 if (ifa->ifa_flags & IFA_F_SECONDARY) { 749 if (ifa->ifa_flags & IFA_F_SECONDARY) {
801 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 750 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
802 if (prim == NULL) { 751 if (prim == NULL) {
803 pr_warn("%s: bug: prim == NULL\n", __func__); 752 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
804 return; 753 return;
805 } 754 }
806 if (iprim && iprim != prim) { 755 if (iprim && iprim != prim) {
807 pr_warn("%s: bug: iprim != prim\n", __func__); 756 printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
808 return; 757 return;
809 } 758 }
810 } else if (!ipv4_is_zeronet(any) && 759 } else if (!ipv4_is_zeronet(any) &&
@@ -931,6 +880,10 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
931 .flowi4_scope = frn->fl_scope, 880 .flowi4_scope = frn->fl_scope,
932 }; 881 };
933 882
883#ifdef CONFIG_IP_MULTIPLE_TABLES
884 res.r = NULL;
885#endif
886
934 frn->err = -ENOENT; 887 frn->err = -ENOENT;
935 if (tb) { 888 if (tb) {
936 local_bh_disable(); 889 local_bh_disable();
@@ -956,7 +909,7 @@ static void nl_fib_input(struct sk_buff *skb)
956 struct fib_result_nl *frn; 909 struct fib_result_nl *frn;
957 struct nlmsghdr *nlh; 910 struct nlmsghdr *nlh;
958 struct fib_table *tb; 911 struct fib_table *tb;
959 u32 portid; 912 u32 pid;
960 913
961 net = sock_net(skb->sk); 914 net = sock_net(skb->sk);
962 nlh = nlmsg_hdr(skb); 915 nlh = nlmsg_hdr(skb);
@@ -974,20 +927,17 @@ static void nl_fib_input(struct sk_buff *skb)
974 927
975 nl_fib_lookup(frn, tb); 928 nl_fib_lookup(frn, tb);
976 929
977 portid = NETLINK_CB(skb).portid; /* pid of sending process */ 930 pid = NETLINK_CB(skb).pid; /* pid of sending process */
978 NETLINK_CB(skb).portid = 0; /* from kernel */ 931 NETLINK_CB(skb).pid = 0; /* from kernel */
979 NETLINK_CB(skb).dst_group = 0; /* unicast */ 932 NETLINK_CB(skb).dst_group = 0; /* unicast */
980 netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); 933 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
981} 934}
982 935
983static int __net_init nl_fib_lookup_init(struct net *net) 936static int __net_init nl_fib_lookup_init(struct net *net)
984{ 937{
985 struct sock *sk; 938 struct sock *sk;
986 struct netlink_kernel_cfg cfg = { 939 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
987 .input = nl_fib_input, 940 nl_fib_input, NULL, THIS_MODULE);
988 };
989
990 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
991 if (sk == NULL) 941 if (sk == NULL)
992 return -EAFNOSUPPORT; 942 return -EAFNOSUPPORT;
993 net->ipv4.fibnl = sk; 943 net->ipv4.fibnl = sk;
@@ -1000,11 +950,11 @@ static void nl_fib_lookup_exit(struct net *net)
1000 net->ipv4.fibnl = NULL; 950 net->ipv4.fibnl = NULL;
1001} 951}
1002 952
1003static void fib_disable_ip(struct net_device *dev, int force) 953static void fib_disable_ip(struct net_device *dev, int force, int delay)
1004{ 954{
1005 if (fib_sync_down_dev(dev, force)) 955 if (fib_sync_down_dev(dev, force))
1006 fib_flush(dev_net(dev)); 956 fib_flush(dev_net(dev));
1007 rt_cache_flush(dev_net(dev)); 957 rt_cache_flush(dev_net(dev), delay);
1008 arp_ifdown(dev); 958 arp_ifdown(dev);
1009} 959}
1010 960
@@ -1021,7 +971,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1021 fib_sync_up(dev); 971 fib_sync_up(dev);
1022#endif 972#endif
1023 atomic_inc(&net->ipv4.dev_addr_genid); 973 atomic_inc(&net->ipv4.dev_addr_genid);
1024 rt_cache_flush(dev_net(dev)); 974 rt_cache_flush(dev_net(dev), -1);
1025 break; 975 break;
1026 case NETDEV_DOWN: 976 case NETDEV_DOWN:
1027 fib_del_ifaddr(ifa, NULL); 977 fib_del_ifaddr(ifa, NULL);
@@ -1030,9 +980,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1030 /* Last address was deleted from this interface. 980 /* Last address was deleted from this interface.
1031 * Disable IP. 981 * Disable IP.
1032 */ 982 */
1033 fib_disable_ip(dev, 1); 983 fib_disable_ip(dev, 1, 0);
1034 } else { 984 } else {
1035 rt_cache_flush(dev_net(dev)); 985 rt_cache_flush(dev_net(dev), -1);
1036 } 986 }
1037 break; 987 break;
1038 } 988 }
@@ -1042,16 +992,16 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1042static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 992static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1043{ 993{
1044 struct net_device *dev = ptr; 994 struct net_device *dev = ptr;
1045 struct in_device *in_dev; 995 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1046 struct net *net = dev_net(dev); 996 struct net *net = dev_net(dev);
1047 997
1048 if (event == NETDEV_UNREGISTER) { 998 if (event == NETDEV_UNREGISTER) {
1049 fib_disable_ip(dev, 2); 999 fib_disable_ip(dev, 2, -1);
1050 rt_flush_dev(dev);
1051 return NOTIFY_DONE; 1000 return NOTIFY_DONE;
1052 } 1001 }
1053 1002
1054 in_dev = __in_dev_get_rtnl(dev); 1003 if (!in_dev)
1004 return NOTIFY_DONE;
1055 1005
1056 switch (event) { 1006 switch (event) {
1057 case NETDEV_UP: 1007 case NETDEV_UP:
@@ -1062,14 +1012,21 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1062 fib_sync_up(dev); 1012 fib_sync_up(dev);
1063#endif 1013#endif
1064 atomic_inc(&net->ipv4.dev_addr_genid); 1014 atomic_inc(&net->ipv4.dev_addr_genid);
1065 rt_cache_flush(net); 1015 rt_cache_flush(dev_net(dev), -1);
1066 break; 1016 break;
1067 case NETDEV_DOWN: 1017 case NETDEV_DOWN:
1068 fib_disable_ip(dev, 0); 1018 fib_disable_ip(dev, 0, 0);
1069 break; 1019 break;
1070 case NETDEV_CHANGEMTU: 1020 case NETDEV_CHANGEMTU:
1071 case NETDEV_CHANGE: 1021 case NETDEV_CHANGE:
1072 rt_cache_flush(net); 1022 rt_cache_flush(dev_net(dev), 0);
1023 break;
1024 case NETDEV_UNREGISTER_BATCH:
1025 /* The batch unregister is only called on the first
1026 * device in the list of devices being unregistered.
1027 * Therefore we should not pass dev_net(dev) in here.
1028 */
1029 rt_cache_flush_batch(NULL);
1073 break; 1030 break;
1074 } 1031 }
1075 return NOTIFY_DONE; 1032 return NOTIFY_DONE;
@@ -1134,9 +1091,6 @@ static int __net_init fib_net_init(struct net *net)
1134{ 1091{
1135 int error; 1092 int error;
1136 1093
1137#ifdef CONFIG_IP_ROUTE_CLASSID
1138 net->ipv4.fib_num_tclassid_users = 0;
1139#endif
1140 error = ip_fib_net_init(net); 1094 error = ip_fib_net_init(net);
1141 if (error < 0) 1095 if (error < 0)
1142 goto out; 1096 goto out;
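fib_frontend.c folds __fib_validate_source() back into fib_validate_source() and recomputes the rp_filter setting inside it, again skipping the check for IPsec-protected packets. The core of that reverse-path test, condensed for illustration only (the real hunk above also handles multipath and loose-mode matching):

/* rp_filter is ignored for IPsec-protected traffic; otherwise the
 * route back to the source must leave via the arrival device. */
static int rpf_check_example(struct sk_buff *skb, struct in_device *in_dev,
			     struct net_device *dev, struct fib_result *res)
{
	int rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);

	if (rpf && FIB_RES_DEV(*res) != dev)
		return -EXDEV;	/* fails strict reverse-path filtering */
	return 0;
}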
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce..a53bb1b5b11 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/list.h> 27#include <linux/list.h>
28#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
29#include <linux/export.h>
30#include <net/ip.h> 29#include <net/ip.h>
31#include <net/route.h> 30#include <net/route.h>
32#include <net/tcp.h> 31#include <net/tcp.h>
@@ -47,7 +46,14 @@ struct fib4_rule {
47#endif 46#endif
48}; 47};
49 48
50int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(const struct fib_result *res)
51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
53}
54#endif
55
56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
51{ 57{
52 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
53 .result = res, 59 .result = res,
@@ -56,15 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
56 int err; 62 int err;
57 63
58 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
59#ifdef CONFIG_IP_ROUTE_CLASSID 65 res->r = arg.rule;
60 if (arg.rule) 66
61 res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
62 else
63 res->tclassid = 0;
64#endif
65 return err; 67 return err;
66} 68}
67EXPORT_SYMBOL_GPL(__fib_lookup);
68 69
69static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, 70static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
70 int flags, struct fib_lookup_arg *arg) 71 int flags, struct fib_lookup_arg *arg)
@@ -166,11 +167,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 168
168#ifdef CONFIG_IP_ROUTE_CLASSID 169#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) { 170 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171 if (rule4->tclassid)
172 net->ipv4.fib_num_tclassid_users++;
173 }
174#endif 172#endif
175 173
176 rule4->src_len = frh->src_len; 174 rule4->src_len = frh->src_len;
@@ -179,24 +177,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
179 rule4->dstmask = inet_make_mask(rule4->dst_len); 177 rule4->dstmask = inet_make_mask(rule4->dst_len);
180 rule4->tos = frh->tos; 178 rule4->tos = frh->tos;
181 179
182 net->ipv4.fib_has_custom_rules = true;
183 err = 0; 180 err = 0;
184errout: 181errout:
185 return err; 182 return err;
186} 183}
187 184
188static void fib4_rule_delete(struct fib_rule *rule)
189{
190 struct net *net = rule->fr_net;
191#ifdef CONFIG_IP_ROUTE_CLASSID
192 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
193
194 if (rule4->tclassid)
195 net->ipv4.fib_num_tclassid_users--;
196#endif
197 net->ipv4.fib_has_custom_rules = true;
198}
199
200static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, 185static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
201 struct nlattr **tb) 186 struct nlattr **tb)
202{ 187{
@@ -234,15 +219,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
234 frh->src_len = rule4->src_len; 219 frh->src_len = rule4->src_len;
235 frh->tos = rule4->tos; 220 frh->tos = rule4->tos;
236 221
237 if ((rule4->dst_len && 222 if (rule4->dst_len)
238 nla_put_be32(skb, FRA_DST, rule4->dst)) || 223 NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
239 (rule4->src_len && 224
240 nla_put_be32(skb, FRA_SRC, rule4->src))) 225 if (rule4->src_len)
241 goto nla_put_failure; 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
227
242#ifdef CONFIG_IP_ROUTE_CLASSID 228#ifdef CONFIG_IP_ROUTE_CLASSID
243 if (rule4->tclassid && 229 if (rule4->tclassid)
244 nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
245 goto nla_put_failure;
246#endif 231#endif
247 return 0; 232 return 0;
248 233
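[Annotation] The fib4_rule_fill() hunk is one instance of a tree-wide conversion between two ways of appending netlink attributes. The NLA_PUT_* macros (right column) hid a jump to the local nla_put_failure label; the nla_put_*() helpers (left column) return non-zero on overflow and the caller branches explicitly. A rough sketch of the removed macro and the two equivalent call sites, for orientation (the macro body is paraphrased, not quoted from the tree):

        /* roughly what the removed macro family expanded to: */
        #define NLA_PUT_BE32(skb, attrtype, value)                      \
                do {                                                    \
                        if (nla_put_be32(skb, attrtype, value) < 0)     \
                                goto nla_put_failure;                   \
                } while (0)

        /* right column: implicit error path inside the macro */
        if (rule4->dst_len)
                NLA_PUT_BE32(skb, FRA_DST, rule4->dst);

        /* left column: explicit error path at the call site */
        if (rule4->dst_len &&
            nla_put_be32(skb, FRA_DST, rule4->dst))
                goto nla_put_failure;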
@@ -259,17 +244,16 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
259 244
260static void fib4_rule_flush_cache(struct fib_rules_ops *ops) 245static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
261{ 246{
262 rt_cache_flush(ops->fro_net); 247 rt_cache_flush(ops->fro_net, -1);
263} 248}
264 249
265static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { 250static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
266 .family = AF_INET, 251 .family = AF_INET,
267 .rule_size = sizeof(struct fib4_rule), 252 .rule_size = sizeof(struct fib4_rule),
268 .addr_size = sizeof(u32), 253 .addr_size = sizeof(u32),
269 .action = fib4_rule_action, 254 .action = fib4_rule_action,
270 .match = fib4_rule_match, 255 .match = fib4_rule_match,
271 .configure = fib4_rule_configure, 256 .configure = fib4_rule_configure,
272 .delete = fib4_rule_delete,
273 .compare = fib4_rule_compare, 257 .compare = fib4_rule_compare,
274 .fill = fib4_rule_fill, 258 .fill = fib4_rule_fill,
275 .default_pref = fib_default_rule_pref, 259 .default_pref = fib_default_rule_pref,
@@ -309,7 +293,6 @@ int __net_init fib4_rules_init(struct net *net)
309 if (err < 0) 293 if (err < 0)
310 goto fail; 294 goto fail;
311 net->ipv4.rules_ops = ops; 295 net->ipv4.rules_ops = ops;
312 net->ipv4.fib_has_custom_rules = false;
313 return 0; 296 return 0;
314 297
315fail: 298fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4797a800faf..80106d89d54 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/system.h>
17#include <linux/bitops.h> 18#include <linux/bitops.h>
18#include <linux/types.h> 19#include <linux/types.h>
19#include <linux/kernel.h> 20#include <linux/kernel.h>
@@ -140,77 +141,11 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 141 },
141}; 142};
142 143
143static void rt_fibinfo_free(struct rtable __rcu **rtp)
144{
145 struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147 if (!rt)
148 return;
149
150 /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151 * because we waited an RCU grace period before calling
152 * free_fib_info_rcu()
153 */
154
155 dst_free(&rt->dst);
156}
157
158static void free_nh_exceptions(struct fib_nh *nh)
159{
160 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
161 int i;
162
163 for (i = 0; i < FNHE_HASH_SIZE; i++) {
164 struct fib_nh_exception *fnhe;
165
166 fnhe = rcu_dereference_protected(hash[i].chain, 1);
167 while (fnhe) {
168 struct fib_nh_exception *next;
169
170 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
171
172 rt_fibinfo_free(&fnhe->fnhe_rth);
173
174 kfree(fnhe);
175
176 fnhe = next;
177 }
178 }
179 kfree(hash);
180}
181
182static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
183{
184 int cpu;
185
186 if (!rtp)
187 return;
188
189 for_each_possible_cpu(cpu) {
190 struct rtable *rt;
191
192 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
193 if (rt)
194 dst_free(&rt->dst);
195 }
196 free_percpu(rtp);
197}
198
199/* Release a nexthop info record */ 144/* Release a nexthop info record */
200static void free_fib_info_rcu(struct rcu_head *head) 145static void free_fib_info_rcu(struct rcu_head *head)
201{ 146{
202 struct fib_info *fi = container_of(head, struct fib_info, rcu); 147 struct fib_info *fi = container_of(head, struct fib_info, rcu);
203 148
204 change_nexthops(fi) {
205 if (nexthop_nh->nh_dev)
206 dev_put(nexthop_nh->nh_dev);
207 if (nexthop_nh->nh_exceptions)
208 free_nh_exceptions(nexthop_nh);
209 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
210 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
211 } endfor_nexthops(fi);
212
213 release_net(fi->fib_net);
214 if (fi->fib_metrics != (u32 *) dst_default_metrics) 149 if (fi->fib_metrics != (u32 *) dst_default_metrics)
215 kfree(fi->fib_metrics); 150 kfree(fi->fib_metrics);
216 kfree(fi); 151 kfree(fi);
@@ -219,16 +154,16 @@ static void free_fib_info_rcu(struct rcu_head *head)
219void free_fib_info(struct fib_info *fi) 154void free_fib_info(struct fib_info *fi)
220{ 155{
221 if (fi->fib_dead == 0) { 156 if (fi->fib_dead == 0) {
222 pr_warn("Freeing alive fib_info %p\n", fi); 157 pr_warning("Freeing alive fib_info %p\n", fi);
223 return; 158 return;
224 } 159 }
225 fib_info_cnt--;
226#ifdef CONFIG_IP_ROUTE_CLASSID
227 change_nexthops(fi) { 160 change_nexthops(fi) {
228 if (nexthop_nh->nh_tclassid) 161 if (nexthop_nh->nh_dev)
229 fi->fib_net->ipv4.fib_num_tclassid_users--; 162 dev_put(nexthop_nh->nh_dev);
163 nexthop_nh->nh_dev = NULL;
230 } endfor_nexthops(fi); 164 } endfor_nexthops(fi);
231#endif 165 fib_info_cnt--;
166 release_net(fi->fib_net);
232 call_rcu(&fi->rcu, free_fib_info_rcu); 167 call_rcu(&fi->rcu, free_fib_info_rcu);
233} 168}
234 169
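[Annotation] Both columns of the free_fib_info() hunk keep the same RCU-deferred teardown: the final kfree() runs in an RCU callback, so lockless readers still traversing the FIB never touch freed memory. A minimal sketch of that pattern with illustrative names:

        /* assumes <linux/rcupdate.h>, <linux/slab.h> */
        struct example_obj {
                int value;
                struct rcu_head rcu;
        };

        static void example_free_rcu(struct rcu_head *head)
        {
                struct example_obj *obj = container_of(head, struct example_obj, rcu);

                kfree(obj);                     /* runs only after a grace period */
        }

        static void example_release(struct example_obj *obj)
        {
                call_rcu(&obj->rcu, example_free_rcu);  /* defer the actual free */
        }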
@@ -314,7 +249,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
314 nfi->fib_scope == fi->fib_scope && 249 nfi->fib_scope == fi->fib_scope &&
315 nfi->fib_prefsrc == fi->fib_prefsrc && 250 nfi->fib_prefsrc == fi->fib_prefsrc &&
316 nfi->fib_priority == fi->fib_priority && 251 nfi->fib_priority == fi->fib_priority &&
317 nfi->fib_type == fi->fib_type &&
318 memcmp(nfi->fib_metrics, fi->fib_metrics, 252 memcmp(nfi->fib_metrics, fi->fib_metrics,
319 sizeof(u32) * RTAX_MAX) == 0 && 253 sizeof(u32) * RTAX_MAX) == 0 &&
320 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && 254 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
@@ -392,7 +326,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
392 if (skb == NULL) 326 if (skb == NULL)
393 goto errout; 327 goto errout;
394 328
395 err = fib_dump_info(skb, info->portid, seq, event, tb_id, 329 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
396 fa->fa_type, key, dst_len, 330 fa->fa_type, key, dst_len,
397 fa->fa_tos, fa->fa_info, nlm_flags); 331 fa->fa_tos, fa->fa_info, nlm_flags);
398 if (err < 0) { 332 if (err < 0) {
@@ -401,7 +335,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
401 kfree_skb(skb); 335 kfree_skb(skb);
402 goto errout; 336 goto errout;
403 } 337 }
404 rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, 338 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
405 info->nlh, GFP_KERNEL); 339 info->nlh, GFP_KERNEL);
406 return; 340 return;
407errout: 341errout:
@@ -488,8 +422,6 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
488#ifdef CONFIG_IP_ROUTE_CLASSID 422#ifdef CONFIG_IP_ROUTE_CLASSID
489 nla = nla_find(attrs, attrlen, RTA_FLOW); 423 nla = nla_find(attrs, attrlen, RTA_FLOW);
490 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 424 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
491 if (nexthop_nh->nh_tclassid)
492 fi->fib_net->ipv4.fib_num_tclassid_users++;
493#endif 425#endif
494 } 426 }
495 427
@@ -803,7 +735,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
803 unsigned int bytes; 735 unsigned int bytes;
804 736
805 if (!new_size) 737 if (!new_size)
806 new_size = 16; 738 new_size = 1;
807 bytes = new_size * sizeof(struct hlist_head *); 739 bytes = new_size * sizeof(struct hlist_head *);
808 new_info_hash = fib_info_hash_alloc(bytes); 740 new_info_hash = fib_info_hash_alloc(bytes);
809 new_laddrhash = fib_info_hash_alloc(bytes); 741 new_laddrhash = fib_info_hash_alloc(bytes);
@@ -834,14 +766,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
834 fi->fib_flags = cfg->fc_flags; 766 fi->fib_flags = cfg->fc_flags;
835 fi->fib_priority = cfg->fc_priority; 767 fi->fib_priority = cfg->fc_priority;
836 fi->fib_prefsrc = cfg->fc_prefsrc; 768 fi->fib_prefsrc = cfg->fc_prefsrc;
837 fi->fib_type = cfg->fc_type;
838 769
839 fi->fib_nhs = nhs; 770 fi->fib_nhs = nhs;
840 change_nexthops(fi) { 771 change_nexthops(fi) {
841 nexthop_nh->nh_parent = fi; 772 nexthop_nh->nh_parent = fi;
842 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
843 if (!nexthop_nh->nh_pcpu_rth_output)
844 goto failure;
845 } endfor_nexthops(fi) 773 } endfor_nexthops(fi)
846 774
847 if (cfg->fc_mx) { 775 if (cfg->fc_mx) {
@@ -852,16 +780,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
852 int type = nla_type(nla); 780 int type = nla_type(nla);
853 781
854 if (type) { 782 if (type) {
855 u32 val;
856
857 if (type > RTAX_MAX) 783 if (type > RTAX_MAX)
858 goto err_inval; 784 goto err_inval;
859 val = nla_get_u32(nla); 785 fi->fib_metrics[type - 1] = nla_get_u32(nla);
860 if (type == RTAX_ADVMSS && val > 65535 - 40)
861 val = 65535 - 40;
862 if (type == RTAX_MTU && val > 65535 - 15)
863 val = 65535 - 15;
864 fi->fib_metrics[type - 1] = val;
865 } 786 }
866 } 787 }
867 } 788 }
@@ -890,8 +811,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
890 nh->nh_flags = cfg->fc_flags; 811 nh->nh_flags = cfg->fc_flags;
891#ifdef CONFIG_IP_ROUTE_CLASSID 812#ifdef CONFIG_IP_ROUTE_CLASSID
892 nh->nh_tclassid = cfg->fc_flow; 813 nh->nh_tclassid = cfg->fc_flow;
893 if (nh->nh_tclassid)
894 fi->fib_net->ipv4.fib_num_tclassid_users++;
895#endif 814#endif
896#ifdef CONFIG_IP_ROUTE_MULTIPATH 815#ifdef CONFIG_IP_ROUTE_MULTIPATH
897 nh->nh_weight = 1; 816 nh->nh_weight = 1;
@@ -993,14 +912,14 @@ failure:
993 return ERR_PTR(err); 912 return ERR_PTR(err);
994} 913}
995 914
996int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, 915int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
997 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, 916 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
998 struct fib_info *fi, unsigned int flags) 917 struct fib_info *fi, unsigned int flags)
999{ 918{
1000 struct nlmsghdr *nlh; 919 struct nlmsghdr *nlh;
1001 struct rtmsg *rtm; 920 struct rtmsg *rtm;
1002 921
1003 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); 922 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
1004 if (nlh == NULL) 923 if (nlh == NULL)
1005 return -EMSGSIZE; 924 return -EMSGSIZE;
1006 925
@@ -1013,36 +932,33 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1013 rtm->rtm_table = tb_id; 932 rtm->rtm_table = tb_id;
1014 else 933 else
1015 rtm->rtm_table = RT_TABLE_COMPAT; 934 rtm->rtm_table = RT_TABLE_COMPAT;
1016 if (nla_put_u32(skb, RTA_TABLE, tb_id)) 935 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
1017 goto nla_put_failure;
1018 rtm->rtm_type = type; 936 rtm->rtm_type = type;
1019 rtm->rtm_flags = fi->fib_flags; 937 rtm->rtm_flags = fi->fib_flags;
1020 rtm->rtm_scope = fi->fib_scope; 938 rtm->rtm_scope = fi->fib_scope;
1021 rtm->rtm_protocol = fi->fib_protocol; 939 rtm->rtm_protocol = fi->fib_protocol;
1022 940
1023 if (rtm->rtm_dst_len && 941 if (rtm->rtm_dst_len)
1024 nla_put_be32(skb, RTA_DST, dst)) 942 NLA_PUT_BE32(skb, RTA_DST, dst);
1025 goto nla_put_failure; 943
1026 if (fi->fib_priority && 944 if (fi->fib_priority)
1027 nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) 945 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
1028 goto nla_put_failure; 946
1029 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) 947 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
1030 goto nla_put_failure; 948 goto nla_put_failure;
1031 949
1032 if (fi->fib_prefsrc && 950 if (fi->fib_prefsrc)
1033 nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) 951 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
1034 goto nla_put_failure; 952
1035 if (fi->fib_nhs == 1) { 953 if (fi->fib_nhs == 1) {
1036 if (fi->fib_nh->nh_gw && 954 if (fi->fib_nh->nh_gw)
1037 nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) 955 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
1038 goto nla_put_failure; 956
1039 if (fi->fib_nh->nh_oif && 957 if (fi->fib_nh->nh_oif)
1040 nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) 958 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1041 goto nla_put_failure;
1042#ifdef CONFIG_IP_ROUTE_CLASSID 959#ifdef CONFIG_IP_ROUTE_CLASSID
1043 if (fi->fib_nh[0].nh_tclassid && 960 if (fi->fib_nh[0].nh_tclassid)
1044 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) 961 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1045 goto nla_put_failure;
1046#endif 962#endif
1047 } 963 }
1048#ifdef CONFIG_IP_ROUTE_MULTIPATH 964#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1063,13 +979,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1063 rtnh->rtnh_hops = nh->nh_weight - 1; 979 rtnh->rtnh_hops = nh->nh_weight - 1;
1064 rtnh->rtnh_ifindex = nh->nh_oif; 980 rtnh->rtnh_ifindex = nh->nh_oif;
1065 981
1066 if (nh->nh_gw && 982 if (nh->nh_gw)
1067 nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) 983 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1068 goto nla_put_failure;
1069#ifdef CONFIG_IP_ROUTE_CLASSID 984#ifdef CONFIG_IP_ROUTE_CLASSID
1070 if (nh->nh_tclassid && 985 if (nh->nh_tclassid)
1071 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) 986 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1072 goto nla_put_failure;
1073#endif 987#endif
1074 /* length of rtnetlink header + attributes */ 988 /* length of rtnetlink header + attributes */
1075 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; 989 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31d771ca9a7..de9e2978476 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -51,6 +51,7 @@
51#define VERSION "0.409" 51#define VERSION "0.409"
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54#include <asm/system.h>
54#include <linux/bitops.h> 55#include <linux/bitops.h>
55#include <linux/types.h> 56#include <linux/types.h>
56#include <linux/kernel.h> 57#include <linux/kernel.h>
@@ -72,7 +73,6 @@
72#include <linux/list.h> 73#include <linux/list.h>
73#include <linux/slab.h> 74#include <linux/slab.h>
74#include <linux/prefetch.h> 75#include <linux/prefetch.h>
75#include <linux/export.h>
76#include <net/net_namespace.h> 76#include <net/net_namespace.h>
77#include <net/ip.h> 77#include <net/ip.h>
78#include <net/protocol.h> 78#include <net/protocol.h>
@@ -159,6 +159,7 @@ struct trie {
159#endif 159#endif
160}; 160};
161 161
162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
163 int wasfull); 164 int wasfull);
164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); 165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -367,7 +368,7 @@ static void __leaf_free_rcu(struct rcu_head *head)
367 368
368static inline void free_leaf(struct leaf *l) 369static inline void free_leaf(struct leaf *l)
369{ 370{
370 call_rcu(&l->rcu, __leaf_free_rcu); 371 call_rcu_bh(&l->rcu, __leaf_free_rcu);
371} 372}
372 373
373static inline void free_leaf_info(struct leaf_info *leaf) 374static inline void free_leaf_info(struct leaf_info *leaf)
@@ -472,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
472 } 473 }
473 474
474 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 475 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
475 sizeof(struct rt_trie_node *) << bits); 476 sizeof(struct rt_trie_node) << bits);
476 return tn; 477 return tn;
477} 478}
478 479
@@ -489,7 +490,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
489 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 490 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
490} 491}
491 492
492static inline void put_child(struct tnode *tn, int i, 493static inline void put_child(struct trie *t, struct tnode *tn, int i,
493 struct rt_trie_node *n) 494 struct rt_trie_node *n)
494{ 495{
495 tnode_put_child_reorg(tn, i, n, -1); 496 tnode_put_child_reorg(tn, i, n, -1);
@@ -753,8 +754,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
753 goto nomem; 754 goto nomem;
754 } 755 }
755 756
756 put_child(tn, 2*i, (struct rt_trie_node *) left); 757 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
757 put_child(tn, 2*i+1, (struct rt_trie_node *) right); 758 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
758 } 759 }
759 } 760 }
760 761
@@ -775,9 +776,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
775 if (tkey_extract_bits(node->key, 776 if (tkey_extract_bits(node->key,
776 oldtnode->pos + oldtnode->bits, 777 oldtnode->pos + oldtnode->bits,
777 1) == 0) 778 1) == 0)
778 put_child(tn, 2*i, node); 779 put_child(t, tn, 2*i, node);
779 else 780 else
780 put_child(tn, 2*i+1, node); 781 put_child(t, tn, 2*i+1, node);
781 continue; 782 continue;
782 } 783 }
783 784
@@ -785,8 +786,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
785 inode = (struct tnode *) node; 786 inode = (struct tnode *) node;
786 787
787 if (inode->bits == 1) { 788 if (inode->bits == 1) {
788 put_child(tn, 2*i, rtnl_dereference(inode->child[0])); 789 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
789 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); 790 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
790 791
791 tnode_free_safe(inode); 792 tnode_free_safe(inode);
792 continue; 793 continue;
@@ -816,22 +817,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
816 */ 817 */
817 818
818 left = (struct tnode *) tnode_get_child(tn, 2*i); 819 left = (struct tnode *) tnode_get_child(tn, 2*i);
819 put_child(tn, 2*i, NULL); 820 put_child(t, tn, 2*i, NULL);
820 821
821 BUG_ON(!left); 822 BUG_ON(!left);
822 823
823 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 824 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
824 put_child(tn, 2*i+1, NULL); 825 put_child(t, tn, 2*i+1, NULL);
825 826
826 BUG_ON(!right); 827 BUG_ON(!right);
827 828
828 size = tnode_child_length(left); 829 size = tnode_child_length(left);
829 for (j = 0; j < size; j++) { 830 for (j = 0; j < size; j++) {
830 put_child(left, j, rtnl_dereference(inode->child[j])); 831 put_child(t, left, j, rtnl_dereference(inode->child[j]));
831 put_child(right, j, rtnl_dereference(inode->child[j + size])); 832 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
832 } 833 }
833 put_child(tn, 2*i, resize(t, left)); 834 put_child(t, tn, 2*i, resize(t, left));
834 put_child(tn, 2*i+1, resize(t, right)); 835 put_child(t, tn, 2*i+1, resize(t, right));
835 836
836 tnode_free_safe(inode); 837 tnode_free_safe(inode);
837 } 838 }
@@ -876,7 +877,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
876 if (!newn) 877 if (!newn)
877 goto nomem; 878 goto nomem;
878 879
879 put_child(tn, i/2, (struct rt_trie_node *)newn); 880 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
880 } 881 }
881 882
882 } 883 }
@@ -891,21 +892,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
891 if (left == NULL) { 892 if (left == NULL) {
892 if (right == NULL) /* Both are empty */ 893 if (right == NULL) /* Both are empty */
893 continue; 894 continue;
894 put_child(tn, i/2, right); 895 put_child(t, tn, i/2, right);
895 continue; 896 continue;
896 } 897 }
897 898
898 if (right == NULL) { 899 if (right == NULL) {
899 put_child(tn, i/2, left); 900 put_child(t, tn, i/2, left);
900 continue; 901 continue;
901 } 902 }
902 903
903 /* Two nonempty children */ 904 /* Two nonempty children */
904 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 905 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
905 put_child(tn, i/2, NULL); 906 put_child(t, tn, i/2, NULL);
906 put_child(newBinNode, 0, left); 907 put_child(t, newBinNode, 0, left);
907 put_child(newBinNode, 1, right); 908 put_child(t, newBinNode, 1, right);
908 put_child(tn, i/2, resize(t, newBinNode)); 909 put_child(t, tn, i/2, resize(t, newBinNode));
909 } 910 }
910 tnode_free_safe(oldtnode); 911 tnode_free_safe(oldtnode);
911 return tn; 912 return tn;
@@ -1006,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1006 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { 1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1007 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1008 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1009 tn = (struct tnode *)resize(t, tn); 1010 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1010 1011
1011 tnode_put_child_reorg(tp, cindex, 1012 tnode_put_child_reorg((struct tnode *)tp, cindex,
1012 (struct rt_trie_node *)tn, wasfull); 1013 (struct rt_trie_node *)tn, wasfull);
1013 1014
1014 tp = node_parent((struct rt_trie_node *) tn); 1015 tp = node_parent((struct rt_trie_node *) tn);
@@ -1023,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1023 1024
1024 /* Handle last (top) tnode */ 1025 /* Handle last (top) tnode */
1025 if (IS_TNODE(tn)) 1026 if (IS_TNODE(tn))
1026 tn = (struct tnode *)resize(t, tn); 1027 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1027 1028
1028 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1029 tnode_free_flush(); 1030 tnode_free_flush();
@@ -1124,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1124 node_set_parent((struct rt_trie_node *)l, tp); 1125 node_set_parent((struct rt_trie_node *)l, tp);
1125 1126
1126 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1127 put_child(tp, cindex, (struct rt_trie_node *)l); 1128 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1128 } else { 1129 } else {
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 /* 1131 /*
@@ -1154,12 +1155,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1154 node_set_parent((struct rt_trie_node *)tn, tp); 1155 node_set_parent((struct rt_trie_node *)tn, tp);
1155 1156
1156 missbit = tkey_extract_bits(key, newpos, 1); 1157 missbit = tkey_extract_bits(key, newpos, 1);
1157 put_child(tn, missbit, (struct rt_trie_node *)l); 1158 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1158 put_child(tn, 1-missbit, n); 1159 put_child(t, tn, 1-missbit, n);
1159 1160
1160 if (tp) { 1161 if (tp) {
1161 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1162 put_child(tp, cindex, (struct rt_trie_node *)tn); 1163 put_child(t, (struct tnode *)tp, cindex,
1164 (struct rt_trie_node *)tn);
1163 } else { 1165 } else {
1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1166 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1165 tp = tn; 1167 tp = tn;
@@ -1167,8 +1169,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1167 } 1169 }
1168 1170
1169 if (tp && tp->pos + tp->bits > 32) 1171 if (tp && tp->pos + tp->bits > 32)
1170 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1172 pr_warning("fib_trie"
1171 tp, tp->pos, tp->bits, key, plen); 1173 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1174 tp, tp->pos, tp->bits, key, plen);
1172 1175
1173 /* Rebalance the trie */ 1176 /* Rebalance the trie */
1174 1177
@@ -1286,7 +1289,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1286 1289
1287 fib_release_info(fi_drop); 1290 fib_release_info(fi_drop);
1288 if (state & FA_S_ACCESSED) 1291 if (state & FA_S_ACCESSED)
1289 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1292 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1290 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, 1293 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1291 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); 1294 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1292 1295
@@ -1333,7 +1336,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1333 list_add_tail_rcu(&new_fa->fa_list, 1336 list_add_tail_rcu(&new_fa->fa_list,
1334 (fa ? &fa->fa_list : fa_head)); 1337 (fa ? &fa->fa_list : fa_head));
1335 1338
1336 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1339 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1337 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, 1340 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1338 &cfg->fc_nlinfo, 0); 1341 &cfg->fc_nlinfo, 0);
1339succeeded: 1342succeeded:
@@ -1368,8 +1371,6 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1368 1371
1369 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) 1372 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1370 continue; 1373 continue;
1371 if (fi->fib_dead)
1372 continue;
1373 if (fa->fa_info->fib_scope < flp->flowi4_scope) 1374 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374 continue; 1375 continue;
1375 fib_alias_accessed(fa); 1376 fib_alias_accessed(fa);
@@ -1550,8 +1551,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1550 * state.directly. 1551 * state.directly.
1551 */ 1552 */
1552 if (pref_mismatch) { 1553 if (pref_mismatch) {
1553 /* fls(x) = __fls(x) + 1 */ 1554 int mp = KEYLENGTH - fls(pref_mismatch);
1554 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1555 1555
1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) 1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1557 goto backtrace; 1557 goto backtrace;
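[Annotation] The fib_table_lookup() hunk above swaps fls() for __fls() using the identity noted in the left column's comment: fls(x) = __fls(x) + 1 for non-zero x, so both expressions compute the same mismatch position. A small userspace check of that identity (kernel fls()/__fls() approximated with GCC builtins; not kernel code):

        #include <assert.h>
        #include <stdint.h>

        static int fls32(uint32_t x)    /* 1-based index of highest set bit, 0 if x == 0 */
        {
                return x ? 32 - __builtin_clz(x) : 0;
        }

        static int __fls32(uint32_t x)  /* 0-based index of highest set bit, x must be non-zero */
        {
                return 31 - __builtin_clz(x);
        }

        int main(void)
        {
                const int KEYLENGTH = 32;
                uint32_t x;

                for (x = 1; x != 0; x <<= 1)
                        assert(KEYLENGTH - fls32(x) == KEYLENGTH - __fls32(x) - 1);
                return 0;
        }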
@@ -1606,7 +1606,6 @@ found:
1606 rcu_read_unlock(); 1606 rcu_read_unlock();
1607 return ret; 1607 return ret;
1608} 1608}
1609EXPORT_SYMBOL_GPL(fib_table_lookup);
1610 1609
1611/* 1610/*
1612 * Remove the leaf and return parent. 1611 * Remove the leaf and return parent.
@@ -1619,10 +1618,10 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1619 1618
1620 if (tp) { 1619 if (tp) {
1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1622 put_child(tp, cindex, NULL); 1621 put_child(t, (struct tnode *)tp, cindex, NULL);
1623 trie_rebalance(t, tp); 1622 trie_rebalance(t, tp);
1624 } else 1623 } else
1625 RCU_INIT_POINTER(t->trie, NULL); 1624 rcu_assign_pointer(t->trie, NULL);
1626 1625
1627 free_leaf(l); 1626 free_leaf(l);
1628} 1627}
@@ -1656,12 +1655,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1656 if (!l) 1655 if (!l)
1657 return -ESRCH; 1656 return -ESRCH;
1658 1657
1659 li = find_leaf_info(l, plen); 1658 fa_head = get_fa_head(l, plen);
1660
1661 if (!li)
1662 return -ESRCH;
1663
1664 fa_head = &li->falh;
1665 fa = fib_find_alias(fa_head, tos, 0); 1659 fa = fib_find_alias(fa_head, tos, 0);
1666 1660
1667 if (!fa) 1661 if (!fa)
@@ -1697,6 +1691,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1697 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, 1691 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1698 &cfg->fc_nlinfo, 0); 1692 &cfg->fc_nlinfo, 0);
1699 1693
1694 l = fib_find_node(t, key);
1695 li = find_leaf_info(l, plen);
1696
1700 list_del_rcu(&fa->fa_list); 1697 list_del_rcu(&fa->fa_list);
1701 1698
1702 if (!plen) 1699 if (!plen)
@@ -1711,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1711 trie_leaf_remove(t, l); 1708 trie_leaf_remove(t, l);
1712 1709
1713 if (fa->fa_state & FA_S_ACCESSED) 1710 if (fa->fa_state & FA_S_ACCESSED)
1714 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1711 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1715 1712
1716 fib_release_info(fa->fa_info); 1713 fib_release_info(fa->fa_info);
1717 alias_free_mem_rcu(fa); 1714 alias_free_mem_rcu(fa);
@@ -1873,7 +1870,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1873 continue; 1870 continue;
1874 } 1871 }
1875 1872
1876 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, 1873 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1877 cb->nlh->nlmsg_seq, 1874 cb->nlh->nlmsg_seq,
1878 RTM_NEWROUTE, 1875 RTM_NEWROUTE,
1879 tb->tb_id, 1876 tb->tb_id,
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a491055c7..dbfc21de347 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -10,8 +10,6 @@
10 * 10 *
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/module.h> 13#include <linux/module.h>
16#include <linux/kernel.h> 14#include <linux/kernel.h>
17#include <linux/kmod.h> 15#include <linux/kmod.h>
@@ -36,7 +34,7 @@ int gre_add_protocol(const struct gre_protocol *proto, u8 version)
36 if (gre_proto[version]) 34 if (gre_proto[version])
37 goto err_out_unlock; 35 goto err_out_unlock;
38 36
39 RCU_INIT_POINTER(gre_proto[version], proto); 37 rcu_assign_pointer(gre_proto[version], proto);
40 spin_unlock(&gre_proto_lock); 38 spin_unlock(&gre_proto_lock);
41 return 0; 39 return 0;
42 40
@@ -56,7 +54,7 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
56 if (rcu_dereference_protected(gre_proto[version], 54 if (rcu_dereference_protected(gre_proto[version],
57 lockdep_is_held(&gre_proto_lock)) != proto) 55 lockdep_is_held(&gre_proto_lock)) != proto)
58 goto err_out_unlock; 56 goto err_out_unlock;
59 RCU_INIT_POINTER(gre_proto[version], NULL); 57 rcu_assign_pointer(gre_proto[version], NULL);
60 spin_unlock(&gre_proto_lock); 58 spin_unlock(&gre_proto_lock);
61 synchronize_rcu(); 59 synchronize_rcu();
62 return 0; 60 return 0;
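[Annotation] In gre.c, one column publishes and clears gre_proto[] with rcu_assign_pointer() and the other with RCU_INIT_POINTER(). The difference is the memory barrier: rcu_assign_pointer() orders the pointee's initialization before the pointer becomes visible to readers, while RCU_INIT_POINTER() is a plain store that is only safe when no ordering is needed, e.g. when storing NULL or a pointer no reader can yet reach. A small illustrative sketch (names hypothetical):

        struct foo {
                int a;
        };

        static struct foo __rcu *gp;

        static void publish(struct foo *p)
        {
                p->a = 1;
                rcu_assign_pointer(gp, p);      /* barrier: a reader that sees p also sees p->a == 1 */
        }

        static void unpublish(void)
        {
                RCU_INIT_POINTER(gp, NULL);     /* storing NULL publishes nothing, no barrier needed */
                synchronize_rcu();              /* wait out existing readers before freeing */
        }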
@@ -120,10 +118,10 @@ static const struct net_protocol net_gre_protocol = {
120 118
121static int __init gre_init(void) 119static int __init gre_init(void)
122{ 120{
123 pr_info("GRE over IPv4 demultiplexor driver\n"); 121 pr_info("GRE over IPv4 demultiplexor driver");
124 122
125 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { 123 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
126 pr_err("can't add protocol\n"); 124 pr_err("gre: can't add protocol\n");
127 return -EAGAIN; 125 return -EAGAIN;
128 } 126 }
129 127
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 17ff9fd7cdd..23ef31baa1a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -62,8 +62,6 @@
62 * 62 *
63 */ 63 */
64 64
65#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
66
67#include <linux/module.h> 65#include <linux/module.h>
68#include <linux/types.h> 66#include <linux/types.h>
69#include <linux/jiffies.h> 67#include <linux/jiffies.h>
@@ -91,11 +89,11 @@
91#include <linux/errno.h> 89#include <linux/errno.h>
92#include <linux/timer.h> 90#include <linux/timer.h>
93#include <linux/init.h> 91#include <linux/init.h>
92#include <asm/system.h>
94#include <asm/uaccess.h> 93#include <asm/uaccess.h>
95#include <net/checksum.h> 94#include <net/checksum.h>
96#include <net/xfrm.h> 95#include <net/xfrm.h>
97#include <net/inet_common.h> 96#include <net/inet_common.h>
98#include <net/ip_fib.h>
99 97
100/* 98/*
101 * Build xmit assembly blocks 99 * Build xmit assembly blocks
@@ -254,11 +252,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
254 252
255 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
257 struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 255 if (!rt->peer)
258 rc = inet_peer_xrlim_allow(peer, 256 rt_bind_peer(rt, fl4->daddr, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
259 net->ipv4.sysctl_icmp_ratelimit); 258 net->ipv4.sysctl_icmp_ratelimit);
260 if (peer)
261 inet_putpeer(peer);
262 } 259 }
263out: 260out:
264 return rc; 261 return rc;
@@ -336,7 +333,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
336 struct flowi4 fl4; 333 struct flowi4 fl4;
337 struct sock *sk; 334 struct sock *sk;
338 struct inet_sock *inet; 335 struct inet_sock *inet;
339 __be32 daddr, saddr; 336 __be32 daddr;
340 337
341 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 338 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
342 return; 339 return;
@@ -350,7 +347,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
350 347
351 inet->tos = ip_hdr(skb)->tos; 348 inet->tos = ip_hdr(skb)->tos;
352 daddr = ipc.addr = ip_hdr(skb)->saddr; 349 daddr = ipc.addr = ip_hdr(skb)->saddr;
353 saddr = fib_compute_spec_dst(skb);
354 ipc.opt = NULL; 350 ipc.opt = NULL;
355 ipc.tx_flags = 0; 351 ipc.tx_flags = 0;
356 if (icmp_param->replyopts.opt.opt.optlen) { 352 if (icmp_param->replyopts.opt.opt.optlen) {
@@ -360,7 +356,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
360 } 356 }
361 memset(&fl4, 0, sizeof(fl4)); 357 memset(&fl4, 0, sizeof(fl4));
362 fl4.daddr = daddr; 358 fl4.daddr = daddr;
363 fl4.saddr = saddr; 359 fl4.saddr = rt->rt_spec_dst;
364 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 360 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
365 fl4.flowi4_proto = IPPROTO_ICMP; 361 fl4.flowi4_proto = IPPROTO_ICMP;
366 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -572,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
572 rcu_read_lock(); 568 rcu_read_lock();
573 if (rt_is_input_route(rt) && 569 if (rt_is_input_route(rt) &&
574 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 570 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
575 dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); 571 dev = dev_get_by_index_rcu(net, rt->rt_iif);
576 572
577 if (dev) 573 if (dev)
578 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 574 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -635,27 +631,6 @@ out:;
635EXPORT_SYMBOL(icmp_send); 631EXPORT_SYMBOL(icmp_send);
636 632
637 633
638static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
639{
640 const struct iphdr *iph = (const struct iphdr *) skb->data;
641 const struct net_protocol *ipprot;
642 int protocol = iph->protocol;
643
644 /* Checkin full IP header plus 8 bytes of protocol to
645 * avoid additional coding at protocol handlers.
646 */
647 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
648 return;
649
650 raw_icmp_error(skb, protocol, info);
651
652 rcu_read_lock();
653 ipprot = rcu_dereference(inet_protos[protocol]);
654 if (ipprot && ipprot->err_handler)
655 ipprot->err_handler(skb, info);
656 rcu_read_unlock();
657}
658
659/* 634/*
660 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. 635 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
661 */ 636 */
@@ -664,8 +639,10 @@ static void icmp_unreach(struct sk_buff *skb)
664{ 639{
665 const struct iphdr *iph; 640 const struct iphdr *iph;
666 struct icmphdr *icmph; 641 struct icmphdr *icmph;
667 struct net *net; 642 int hash, protocol;
643 const struct net_protocol *ipprot;
668 u32 info = 0; 644 u32 info = 0;
645 struct net *net;
669 646
670 net = dev_net(skb_dst(skb)->dev); 647 net = dev_net(skb_dst(skb)->dev);
671 648
@@ -693,16 +670,18 @@ static void icmp_unreach(struct sk_buff *skb)
693 break; 670 break;
694 case ICMP_FRAG_NEEDED: 671 case ICMP_FRAG_NEEDED:
695 if (ipv4_config.no_pmtu_disc) { 672 if (ipv4_config.no_pmtu_disc) {
696 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), 673 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
697 &iph->daddr); 674 &iph->daddr);
698 } else { 675 } else {
699 info = ntohs(icmph->un.frag.mtu); 676 info = ip_rt_frag_needed(net, iph,
677 ntohs(icmph->un.frag.mtu),
678 skb->dev);
700 if (!info) 679 if (!info)
701 goto out; 680 goto out;
702 } 681 }
703 break; 682 break;
704 case ICMP_SR_FAILED: 683 case ICMP_SR_FAILED:
705 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), 684 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
706 &iph->daddr); 685 &iph->daddr);
707 break; 686 break;
708 default: 687 default:
@@ -733,14 +712,37 @@ static void icmp_unreach(struct sk_buff *skb)
733 712
734 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && 713 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
735 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { 714 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
736 net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", 715 if (net_ratelimit())
737 &ip_hdr(skb)->saddr, 716 printk(KERN_WARNING "%pI4 sent an invalid ICMP "
738 icmph->type, icmph->code, 717 "type %u, code %u "
739 &iph->daddr, skb->dev->name); 718 "error to a broadcast: %pI4 on %s\n",
719 &ip_hdr(skb)->saddr,
720 icmph->type, icmph->code,
721 &iph->daddr,
722 skb->dev->name);
740 goto out; 723 goto out;
741 } 724 }
742 725
743 icmp_socket_deliver(skb, info); 726 /* Checkin full IP header plus 8 bytes of protocol to
727 * avoid additional coding at protocol handlers.
728 */
729 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
730 goto out;
731
732 iph = (const struct iphdr *)skb->data;
733 protocol = iph->protocol;
734
735 /*
736 * Deliver ICMP message to raw sockets. Pretty useless feature?
737 */
738 raw_icmp_error(skb, protocol, info);
739
740 hash = protocol & (MAX_INET_PROTOS - 1);
741 rcu_read_lock();
742 ipprot = rcu_dereference(inet_protos[hash]);
743 if (ipprot && ipprot->err_handler)
744 ipprot->err_handler(skb, info);
745 rcu_read_unlock();
744 746
745out: 747out:
746 return; 748 return;
@@ -756,15 +758,46 @@ out_err:
756 758
757static void icmp_redirect(struct sk_buff *skb) 759static void icmp_redirect(struct sk_buff *skb)
758{ 760{
759 if (skb->len < sizeof(struct iphdr)) { 761 const struct iphdr *iph;
760 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); 762
761 return; 763 if (skb->len < sizeof(struct iphdr))
762 } 764 goto out_err;
763 765
766 /*
767 * Get the copied header of the packet that caused the redirect
768 */
764 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 769 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
765 return; 770 goto out;
771
772 iph = (const struct iphdr *)skb->data;
766 773
767 icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); 774 switch (icmp_hdr(skb)->code & 7) {
775 case ICMP_REDIR_NET:
776 case ICMP_REDIR_NETTOS:
777 /*
778 * As per RFC recommendations now handle it as a host redirect.
779 */
780 case ICMP_REDIR_HOST:
781 case ICMP_REDIR_HOSTTOS:
782 ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
783 icmp_hdr(skb)->un.gateway,
784 iph->saddr, skb->dev);
785 break;
786 }
787
788 /* Ping wants to see redirects.
789 * Let's pretend they are errors of sorts... */
790 if (iph->protocol == IPPROTO_ICMP &&
791 iph->ihl >= 5 &&
792 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
793 ping_err(skb, icmp_hdr(skb)->un.gateway);
794 }
795
796out:
797 return;
798out_err:
799 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
800 goto out;
768} 801}
769 802
770/* 803/*
@@ -838,6 +871,87 @@ out_err:
838 goto out; 871 goto out;
839} 872}
840 873
874
875/*
876 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
877 *
878 * RFC1122 (3.2.2.9). A host MUST only send replies to
879 * ADDRESS_MASK requests if it's been configured as an address mask
880 * agent. Receiving a request doesn't constitute implicit permission to
881 * act as one. Of course, implementing this correctly requires (SHOULD)
882 * a way to turn the functionality on and off. Another one for sysctl(),
883 * I guess. -- MS
884 *
885 * RFC1812 (4.3.3.9). A router MUST implement it.
886 * A router SHOULD have switch turning it on/off.
887 * This switch MUST be ON by default.
888 *
889 * Gratuitous replies, zero-source replies are not implemented,
890 * that complies with RFC. DO NOT implement them!!! All the idea
891 * of broadcast addrmask replies as specified in RFC950 is broken.
892 * The problem is that it is not uncommon to have several prefixes
893 * on one physical interface. Moreover, addrmask agent can even be
894 * not aware of existing another prefixes.
895 * If source is zero, addrmask agent cannot choose correct prefix.
896 * Gratuitous mask announcements suffer from the same problem.
897 * RFC1812 explains it, but still allows to use ADDRMASK,
898 * that is pretty silly. --ANK
899 *
900 * All these rules are so bizarre, that I removed kernel addrmask
901 * support at all. It is wrong, it is obsolete, nobody uses it in
902 * any case. --ANK
903 *
904 * Furthermore you can do it with a usermode address agent program
905 * anyway...
906 */
907
908static void icmp_address(struct sk_buff *skb)
909{
910#if 0
911 if (net_ratelimit())
912 printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
913#endif
914}
915
916/*
917 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
918 * loudly if an inconsistency is found.
919 * called with rcu_read_lock()
920 */
921
922static void icmp_address_reply(struct sk_buff *skb)
923{
924 struct rtable *rt = skb_rtable(skb);
925 struct net_device *dev = skb->dev;
926 struct in_device *in_dev;
927 struct in_ifaddr *ifa;
928
929 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
930 return;
931
932 in_dev = __in_dev_get_rcu(dev);
933 if (!in_dev)
934 return;
935
936 if (in_dev->ifa_list &&
937 IN_DEV_LOG_MARTIANS(in_dev) &&
938 IN_DEV_FORWARD(in_dev)) {
939 __be32 _mask, *mp;
940
941 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
942 BUG_ON(mp == NULL);
943 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
944 if (*mp == ifa->ifa_mask &&
945 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
946 break;
947 }
948 if (!ifa && net_ratelimit()) {
949 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
950 mp, dev->name, &ip_hdr(skb)->saddr);
951 }
952 }
953}
954
841static void icmp_discard(struct sk_buff *skb) 955static void icmp_discard(struct sk_buff *skb)
842{ 956{
843} 957}
@@ -1001,10 +1115,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1001 .handler = icmp_discard, 1115 .handler = icmp_discard,
1002 }, 1116 },
1003 [ICMP_ADDRESS] = { 1117 [ICMP_ADDRESS] = {
1004 .handler = icmp_discard, 1118 .handler = icmp_address,
1005 }, 1119 },
1006 [ICMP_ADDRESSREPLY] = { 1120 [ICMP_ADDRESSREPLY] = {
1007 .handler = icmp_discard, 1121 .handler = icmp_address_reply,
1008 }, 1122 },
1009}; 1123};
1010 1124
@@ -1038,9 +1152,10 @@ static int __net_init icmp_sk_init(struct net *net)
1038 net->ipv4.icmp_sk[i] = sk; 1152 net->ipv4.icmp_sk[i] = sk;
1039 1153
1040 /* Enough space for 2 64K ICMP packets, including 1154 /* Enough space for 2 64K ICMP packets, including
1041 * sk_buff/skb_shared_info struct overhead. 1155 * sk_buff struct overhead.
1042 */ 1156 */
1043 sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); 1157 sk->sk_sndbuf =
1158 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1044 1159
1045 /* 1160 /*
1046 * Speedup sock_wfree() 1161 * Speedup sock_wfree()
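[Annotation] The icmp_sk_init() hunk replaces a hand-rolled send-buffer estimate with SKB_TRUESIZE(). Paraphrased from include/linux/skbuff.h (an assumption about this kernel generation, not a quote), the macro charges the aligned overhead of both struct sk_buff and the trailing struct skb_shared_info, which the open-coded "+ sizeof(struct sk_buff)" under-counted:

        /* paraphrased definition: */
        #define SKB_TRUESIZE(X) ((X) +                                          \
                                 SKB_DATA_ALIGN(sizeof(struct sk_buff)) +       \
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

        /* enough space for two 64K ICMP packets, including overhead: */
        sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);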
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 736ab70fd17..e0d42dbb33f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -73,6 +73,7 @@
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h> 74#include <linux/slab.h>
75#include <asm/uaccess.h> 75#include <asm/uaccess.h>
76#include <asm/system.h>
76#include <linux/types.h> 77#include <linux/types.h>
77#include <linux/kernel.h> 78#include <linux/kernel.h>
78#include <linux/jiffies.h> 79#include <linux/jiffies.h>
@@ -303,11 +304,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
303 struct igmpv3_report *pig; 304 struct igmpv3_report *pig;
304 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
305 struct flowi4 fl4; 306 struct flowi4 fl4;
306 int hlen = LL_RESERVED_SPACE(dev);
307 int tlen = dev->needed_tailroom;
308 307
309 while (1) { 308 while (1) {
310 skb = alloc_skb(size + hlen + tlen, 309 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
311 GFP_ATOMIC | __GFP_NOWARN); 310 GFP_ATOMIC | __GFP_NOWARN);
312 if (skb) 311 if (skb)
313 break; 312 break;
@@ -328,7 +327,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
328 skb_dst_set(skb, &rt->dst); 327 skb_dst_set(skb, &rt->dst);
329 skb->dev = dev; 328 skb->dev = dev;
330 329
331 skb_reserve(skb, hlen); 330 skb_reserve(skb, LL_RESERVED_SPACE(dev));
332 331
333 skb_reset_network_header(skb); 332 skb_reset_network_header(skb);
334 pip = ip_hdr(skb); 333 pip = ip_hdr(skb);
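[Annotation] The two igmpv3_newpack() hunks above differ only in how the link-layer headroom and tailroom are sized (LL_RESERVED_SPACE() plus needed_tailroom in the left column versus the older combined helper in the right); the allocation pattern itself is unchanged. A condensed sketch of that pattern as it appears in the left column:

        int hlen = LL_RESERVED_SPACE(dev);      /* headroom for the link-layer header */
        int tlen = dev->needed_tailroom;        /* tailroom some devices require */
        struct sk_buff *skb;

        skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN);
        if (!skb)
                return NULL;
        skb_reserve(skb, hlen);                 /* leave the headroom; data starts after it */
        skb_reset_network_header(skb);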
@@ -344,10 +343,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
344 pip->protocol = IPPROTO_IGMP; 343 pip->protocol = IPPROTO_IGMP;
345 pip->tot_len = 0; /* filled in later */ 344 pip->tot_len = 0; /* filled in later */
346 ip_select_ident(pip, &rt->dst, NULL); 345 ip_select_ident(pip, &rt->dst, NULL);
347 ((u8 *)&pip[1])[0] = IPOPT_RA; 346 ((u8*)&pip[1])[0] = IPOPT_RA;
348 ((u8 *)&pip[1])[1] = 4; 347 ((u8*)&pip[1])[1] = 4;
349 ((u8 *)&pip[1])[2] = 0; 348 ((u8*)&pip[1])[2] = 0;
350 ((u8 *)&pip[1])[3] = 0; 349 ((u8*)&pip[1])[3] = 0;
351 350
352 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; 351 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
353 skb_put(skb, sizeof(*pig)); 352 skb_put(skb, sizeof(*pig));
@@ -648,7 +647,6 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
648 __be32 group = pmc ? pmc->multiaddr : 0; 647 __be32 group = pmc ? pmc->multiaddr : 0;
649 struct flowi4 fl4; 648 struct flowi4 fl4;
650 __be32 dst; 649 __be32 dst;
651 int hlen, tlen;
652 650
653 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) 651 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
654 return igmpv3_send_report(in_dev, pmc); 652 return igmpv3_send_report(in_dev, pmc);
@@ -663,9 +661,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
663 if (IS_ERR(rt)) 661 if (IS_ERR(rt))
664 return -1; 662 return -1;
665 663
666 hlen = LL_RESERVED_SPACE(dev); 664 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
667 tlen = dev->needed_tailroom;
668 skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
669 if (skb == NULL) { 665 if (skb == NULL) {
670 ip_rt_put(rt); 666 ip_rt_put(rt);
671 return -1; 667 return -1;
@@ -673,7 +669,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
673 669
674 skb_dst_set(skb, &rt->dst); 670 skb_dst_set(skb, &rt->dst);
675 671
676 skb_reserve(skb, hlen); 672 skb_reserve(skb, LL_RESERVED_SPACE(dev));
677 673
678 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
679 iph = ip_hdr(skb); 675 iph = ip_hdr(skb);
@@ -688,10 +684,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
688 iph->saddr = fl4.saddr; 684 iph->saddr = fl4.saddr;
689 iph->protocol = IPPROTO_IGMP; 685 iph->protocol = IPPROTO_IGMP;
690 ip_select_ident(iph, &rt->dst, NULL); 686 ip_select_ident(iph, &rt->dst, NULL);
691 ((u8 *)&iph[1])[0] = IPOPT_RA; 687 ((u8*)&iph[1])[0] = IPOPT_RA;
692 ((u8 *)&iph[1])[1] = 4; 688 ((u8*)&iph[1])[1] = 4;
693 ((u8 *)&iph[1])[2] = 0; 689 ((u8*)&iph[1])[2] = 0;
694 ((u8 *)&iph[1])[3] = 0; 690 ((u8*)&iph[1])[3] = 0;
695 691
696 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 692 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
697 ih->type = type; 693 ih->type = type;
@@ -774,7 +770,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
774 if (psf->sf_count[MCAST_INCLUDE] || 770 if (psf->sf_count[MCAST_INCLUDE] ||
775 pmc->sfcount[MCAST_EXCLUDE] != 771 pmc->sfcount[MCAST_EXCLUDE] !=
776 psf->sf_count[MCAST_EXCLUDE]) 772 psf->sf_count[MCAST_EXCLUDE])
777 break; 773 continue;
778 if (srcs[i] == psf->sf_inaddr) { 774 if (srcs[i] == psf->sf_inaddr) {
779 scount++; 775 scount++;
780 break; 776 break;
@@ -815,15 +811,14 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
815 return 1; 811 return 1;
816} 812}
817 813
818/* return true if packet was dropped */ 814static void igmp_heard_report(struct in_device *in_dev, __be32 group)
819static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
820{ 815{
821 struct ip_mc_list *im; 816 struct ip_mc_list *im;
822 817
823 /* Timers are only set for non-local groups */ 818 /* Timers are only set for non-local groups */
824 819
825 if (group == IGMP_ALL_HOSTS) 820 if (group == IGMP_ALL_HOSTS)
826 return false; 821 return;
827 822
828 rcu_read_lock(); 823 rcu_read_lock();
829 for_each_pmc_rcu(in_dev, im) { 824 for_each_pmc_rcu(in_dev, im) {
@@ -833,11 +828,9 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
833 } 828 }
834 } 829 }
835 rcu_read_unlock(); 830 rcu_read_unlock();
836 return false;
837} 831}
838 832
839/* return true if packet was dropped */ 833static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
840static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
841 int len) 834 int len)
842{ 835{
843 struct igmphdr *ih = igmp_hdr(skb); 836 struct igmphdr *ih = igmp_hdr(skb);
@@ -869,7 +862,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
869 /* clear deleted report items */ 862 /* clear deleted report items */
870 igmpv3_clear_delrec(in_dev); 863 igmpv3_clear_delrec(in_dev);
871 } else if (len < 12) { 864 } else if (len < 12) {
872 return true; /* ignore bogus packet; freed by caller */ 865 return; /* ignore bogus packet; freed by caller */
873 } else if (IGMP_V1_SEEN(in_dev)) { 866 } else if (IGMP_V1_SEEN(in_dev)) {
874 /* This is a v3 query with v1 queriers present */ 867 /* This is a v3 query with v1 queriers present */
875 max_delay = IGMP_Query_Response_Interval; 868 max_delay = IGMP_Query_Response_Interval;
@@ -886,13 +879,13 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
886 max_delay = 1; /* can't mod w/ 0 */ 879 max_delay = 1; /* can't mod w/ 0 */
887 } else { /* v3 */ 880 } else { /* v3 */
888 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 881 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
889 return true; 882 return;
890 883
891 ih3 = igmpv3_query_hdr(skb); 884 ih3 = igmpv3_query_hdr(skb);
892 if (ih3->nsrcs) { 885 if (ih3->nsrcs) {
893 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) 886 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
894 + ntohs(ih3->nsrcs)*sizeof(__be32))) 887 + ntohs(ih3->nsrcs)*sizeof(__be32)))
895 return true; 888 return;
896 ih3 = igmpv3_query_hdr(skb); 889 ih3 = igmpv3_query_hdr(skb);
897 } 890 }
898 891
@@ -904,9 +897,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
904 in_dev->mr_qrv = ih3->qrv; 897 in_dev->mr_qrv = ih3->qrv;
905 if (!group) { /* general query */ 898 if (!group) { /* general query */
906 if (ih3->nsrcs) 899 if (ih3->nsrcs)
907 return false; /* no sources allowed */ 900 return; /* no sources allowed */
908 igmp_gq_start_timer(in_dev); 901 igmp_gq_start_timer(in_dev);
909 return false; 902 return;
910 } 903 }
911 /* mark sources to include, if group & source-specific */ 904 /* mark sources to include, if group & source-specific */
912 mark = ih3->nsrcs != 0; 905 mark = ih3->nsrcs != 0;
@@ -942,7 +935,6 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
942 igmp_mod_timer(im, max_delay); 935 igmp_mod_timer(im, max_delay);
943 } 936 }
944 rcu_read_unlock(); 937 rcu_read_unlock();
945 return false;
946} 938}
947 939
948/* called in rcu_read_lock() section */ 940/* called in rcu_read_lock() section */
@@ -952,7 +944,6 @@ int igmp_rcv(struct sk_buff *skb)
952 struct igmphdr *ih; 944 struct igmphdr *ih;
953 struct in_device *in_dev = __in_dev_get_rcu(skb->dev); 945 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
954 int len = skb->len; 946 int len = skb->len;
955 bool dropped = true;
956 947
957 if (in_dev == NULL) 948 if (in_dev == NULL)
958 goto drop; 949 goto drop;
@@ -974,7 +965,7 @@ int igmp_rcv(struct sk_buff *skb)
974 ih = igmp_hdr(skb); 965 ih = igmp_hdr(skb);
975 switch (ih->type) { 966 switch (ih->type) {
976 case IGMP_HOST_MEMBERSHIP_QUERY: 967 case IGMP_HOST_MEMBERSHIP_QUERY:
977 dropped = igmp_heard_query(in_dev, skb, len); 968 igmp_heard_query(in_dev, skb, len);
978 break; 969 break;
979 case IGMP_HOST_MEMBERSHIP_REPORT: 970 case IGMP_HOST_MEMBERSHIP_REPORT:
980 case IGMPV2_HOST_MEMBERSHIP_REPORT: 971 case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -984,7 +975,7 @@ int igmp_rcv(struct sk_buff *skb)
984 /* don't rely on MC router hearing unicast reports */ 975 /* don't rely on MC router hearing unicast reports */
985 if (skb->pkt_type == PACKET_MULTICAST || 976 if (skb->pkt_type == PACKET_MULTICAST ||
986 skb->pkt_type == PACKET_BROADCAST) 977 skb->pkt_type == PACKET_BROADCAST)
987 dropped = igmp_heard_report(in_dev, ih->group); 978 igmp_heard_report(in_dev, ih->group);
988 break; 979 break;
989 case IGMP_PIM: 980 case IGMP_PIM:
990#ifdef CONFIG_IP_PIMSM_V1 981#ifdef CONFIG_IP_PIMSM_V1
@@ -1002,10 +993,7 @@ int igmp_rcv(struct sk_buff *skb)
1002 } 993 }
1003 994
1004drop: 995drop:
1005 if (dropped) 996 kfree_skb(skb);
1006 kfree_skb(skb);
1007 else
1008 consume_skb(skb);
1009 return 0; 997 return 0;
1010} 998}
1011 999
@@ -1023,7 +1011,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
1023 1011
1024 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. 1012 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
1025 We will get multicast token leakage, when IFF_MULTICAST 1013 We will get multicast token leakage, when IFF_MULTICAST
1026 is changed. This check should be done in ndo_set_rx_mode 1014 is changed. This check should be done in dev->set_multicast_list
1027 routine. Something sort of: 1015 routine. Something sort of:
1028 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } 1016 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
1029 --ANK 1017 --ANK
@@ -1588,7 +1576,7 @@ out_unlock:
1588 * Add multicast single-source filter to the interface list 1576 * Add multicast single-source filter to the interface list
1589 */ 1577 */
1590static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, 1578static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
1591 __be32 *psfsrc) 1579 __be32 *psfsrc, int delta)
1592{ 1580{
1593 struct ip_sf_list *psf, *psf_prev; 1581 struct ip_sf_list *psf, *psf_prev;
1594 1582
@@ -1723,15 +1711,14 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1723 pmc->sfcount[sfmode]++; 1711 pmc->sfcount[sfmode]++;
1724 err = 0; 1712 err = 0;
1725 for (i=0; i<sfcount; i++) { 1713 for (i=0; i<sfcount; i++) {
1726 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); 1714 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
1727 if (err) 1715 if (err)
1728 break; 1716 break;
1729 } 1717 }
1730 if (err) { 1718 if (err) {
1731 int j; 1719 int j;
1732 1720
1733 if (!delta) 1721 pmc->sfcount[sfmode]--;
1734 pmc->sfcount[sfmode]--;
1735 for (j=0; j<i; j++) 1722 for (j=0; j<i; j++)
1736 (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); 1723 (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
1737 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { 1724 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
@@ -1850,7 +1837,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1850 } 1837 }
1851 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1838 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1852 iml->sfmode, psf->sl_count, psf->sl_addr, 0); 1839 iml->sfmode, psf->sl_count, psf->sl_addr, 0);
1853 RCU_INIT_POINTER(iml->sflist, NULL); 1840 rcu_assign_pointer(iml->sflist, NULL);
1854 /* decrease mem now to avoid the memleak warning */ 1841 /* decrease mem now to avoid the memleak warning */
1855 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); 1842 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1856 kfree_rcu(psf, rcu); 1843 kfree_rcu(psf, rcu);
@@ -1904,7 +1891,6 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1904 rtnl_unlock(); 1891 rtnl_unlock();
1905 return ret; 1892 return ret;
1906} 1893}
1907EXPORT_SYMBOL(ip_mc_leave_group);
1908 1894
1909int ip_mc_source(int add, int omode, struct sock *sk, struct 1895int ip_mc_source(int add, int omode, struct sock *sk, struct
1910 ip_mreq_source *mreqs, int ifindex) 1896 ip_mreq_source *mreqs, int ifindex)
@@ -2444,8 +2430,6 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2444 struct ip_mc_list *im = (struct ip_mc_list *)v; 2430 struct ip_mc_list *im = (struct ip_mc_list *)v;
2445 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2431 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2446 char *querier; 2432 char *querier;
2447 long delta;
2448
2449#ifdef CONFIG_IP_MULTICAST 2433#ifdef CONFIG_IP_MULTICAST
2450 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : 2434 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
2451 IGMP_V2_SEEN(state->in_dev) ? "V2" : 2435 IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2459,12 +2443,11 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2459 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2443 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2460 } 2444 }
2461 2445
2462 delta = im->timer.expires - jiffies;
2463 seq_printf(seq, 2446 seq_printf(seq,
2464 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", 2447 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
2465 im->multiaddr, im->users, 2448 im->multiaddr, im->users,
2466 im->tm_running, 2449 im->tm_running, im->tm_running ?
2467 im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, 2450 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
2468 im->reporter); 2451 im->reporter);
2469 } 2452 }
2470 return 0; 2453 return 0;
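
The igmp_heard_query() hunks above center on validating a variable-length IGMPv3 query — a fixed header followed by nsrcs 32-bit source addresses — before the source list is ever dereferenced (the pskb_may_pull() calls). A minimal userspace sketch of that length check, assuming a plain byte buffer instead of an skb and a simplified header layout (illustrative only, not the kernel's struct igmpv3_query):

    #include <stdint.h>
    #include <stddef.h>
    #include <arpa/inet.h>   /* ntohs() */

    /* Simplified stand-in for the IGMPv3 query header (assumption, not the kernel layout). */
    struct igmpv3_query_hdr {
        uint8_t  type;
        uint8_t  code;
        uint16_t csum;
        uint32_t group;
        uint8_t  misc;      /* resv/S/QRV */
        uint8_t  qqic;
        uint16_t nsrcs;     /* number of sources, network byte order */
        /* followed by nsrcs * 4 bytes of source addresses */
    };

    /* Return a pointer to the source list only if the buffer really contains the
     * fixed header plus nsrcs 32-bit addresses; otherwise NULL. Mirrors the
     * two-stage pskb_may_pull() checks in the hunks above. */
    static const uint32_t *igmpv3_sources(const uint8_t *buf, size_t len,
                                          uint16_t *nsrcs_out)
    {
        const struct igmpv3_query_hdr *q;
        uint16_t nsrcs;

        if (len < sizeof(*q))
            return NULL;                    /* truncated fixed header */
        q = (const struct igmpv3_query_hdr *)buf;
        nsrcs = ntohs(q->nsrcs);
        if (len < sizeof(*q) + (size_t)nsrcs * sizeof(uint32_t))
            return NULL;                    /* source list truncated */
        *nsrcs_out = nsrcs;
        return (const uint32_t *)(buf + sizeof(*q));
    }

The point carried over from the diff is that nsrcs comes off the wire, so the total length has to be re-checked after the ntohs() conversion; in the kernel version the header pointer is also reloaded after the second pull, since pskb_may_pull() may relocate the packet data.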
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f00d52..c14d88ad348 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -42,8 +42,7 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports);
42 42
43void inet_get_local_port_range(int *low, int *high) 43void inet_get_local_port_range(int *low, int *high)
44{ 44{
45 unsigned int seq; 45 unsigned seq;
46
47 do { 46 do {
48 seq = read_seqbegin(&sysctl_local_ports.lock); 47 seq = read_seqbegin(&sysctl_local_ports.lock);
49 48
@@ -54,7 +53,7 @@ void inet_get_local_port_range(int *low, int *high)
54EXPORT_SYMBOL(inet_get_local_port_range); 53EXPORT_SYMBOL(inet_get_local_port_range);
55 54
56int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
57 const struct inet_bind_bucket *tb, bool relax) 56 const struct inet_bind_bucket *tb)
58{ 57{
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
@@ -80,14 +79,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
80 sk2_rcv_saddr == sk_rcv_saddr(sk)) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 if (!relax && reuse && sk2->sk_reuse &&
84 sk2->sk_state != TCP_LISTEN) {
85 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
86
87 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
88 sk2_rcv_saddr == sk_rcv_saddr(sk))
89 break;
90 }
91 } 82 }
92 } 83 }
93 return node != NULL; 84 return node != NULL;
@@ -131,16 +122,12 @@ again:
131 (tb->num_owners < smallest_size || smallest_size == -1)) { 122 (tb->num_owners < smallest_size || smallest_size == -1)) {
132 smallest_size = tb->num_owners; 123 smallest_size = tb->num_owners;
133 smallest_rover = rover; 124 smallest_rover = rover;
134 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && 125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
135 !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { 126 spin_unlock(&head->lock);
136 snum = smallest_rover; 127 snum = smallest_rover;
137 goto tb_found; 128 goto have_snum;
138 } 129 }
139 } 130 }
140 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
141 snum = rover;
142 goto tb_found;
143 }
144 goto next; 131 goto next;
145 } 132 }
146 break; 133 break;
@@ -182,22 +169,18 @@ have_snum:
182 goto tb_not_found; 169 goto tb_not_found;
183tb_found: 170tb_found:
184 if (!hlist_empty(&tb->owners)) { 171 if (!hlist_empty(&tb->owners)) {
185 if (sk->sk_reuse == SK_FORCE_REUSE)
186 goto success;
187
188 if (tb->fastreuse > 0 && 172 if (tb->fastreuse > 0 &&
189 sk->sk_reuse && sk->sk_state != TCP_LISTEN && 173 sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
190 smallest_size == -1) { 174 smallest_size == -1) {
191 goto success; 175 goto success;
192 } else { 176 } else {
193 ret = 1; 177 ret = 1;
194 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { 178 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
195 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && 179 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
196 smallest_size != -1 && --attempts >= 0) { 180 smallest_size != -1 && --attempts >= 0) {
197 spin_unlock(&head->lock); 181 spin_unlock(&head->lock);
198 goto again; 182 goto again;
199 } 183 }
200
201 goto fail_unlock; 184 goto fail_unlock;
202 } 185 }
203 } 186 }
@@ -283,9 +266,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
283struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 266struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
284{ 267{
285 struct inet_connection_sock *icsk = inet_csk(sk); 268 struct inet_connection_sock *icsk = inet_csk(sk);
286 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
287 struct sock *newsk; 269 struct sock *newsk;
288 struct request_sock *req;
289 int error; 270 int error;
290 271
291 lock_sock(sk); 272 lock_sock(sk);
@@ -298,7 +279,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
298 goto out_err; 279 goto out_err;
299 280
300 /* Find already established connection */ 281 /* Find already established connection */
301 if (reqsk_queue_empty(queue)) { 282 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
302 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 283 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
303 284
304 /* If this is a non blocking socket don't sleep */ 285 /* If this is a non blocking socket don't sleep */
@@ -310,32 +291,14 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
310 if (error) 291 if (error)
311 goto out_err; 292 goto out_err;
312 } 293 }
313 req = reqsk_queue_remove(queue); 294
314 newsk = req->sk; 295 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
315 296 WARN_ON(newsk->sk_state == TCP_SYN_RECV);
316 sk_acceptq_removed(sk);
317 if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
318 spin_lock_bh(&queue->fastopenq->lock);
319 if (tcp_rsk(req)->listener) {
320 /* We are still waiting for the final ACK from 3WHS
321 * so can't free req now. Instead, we set req->sk to
322 * NULL to signify that the child socket is taken
323 * so reqsk_fastopen_remove() will free the req
324 * when 3WHS finishes (or is aborted).
325 */
326 req->sk = NULL;
327 req = NULL;
328 }
329 spin_unlock_bh(&queue->fastopenq->lock);
330 }
331out: 297out:
332 release_sock(sk); 298 release_sock(sk);
333 if (req)
334 __reqsk_free(req);
335 return newsk; 299 return newsk;
336out_err: 300out_err:
337 newsk = NULL; 301 newsk = NULL;
338 req = NULL;
339 *err = error; 302 *err = error;
340 goto out; 303 goto out;
341} 304}
@@ -394,19 +357,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
394 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
395 struct ip_options_rcu *opt = inet_rsk(req)->opt; 358 struct ip_options_rcu *opt = inet_rsk(req)->opt;
396 struct net *net = sock_net(sk); 359 struct net *net = sock_net(sk);
397 int flags = inet_sk_flowi_flags(sk);
398 360
399 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 361 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
400 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 362 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
401 sk->sk_protocol, 363 sk->sk_protocol, inet_sk_flowi_flags(sk),
402 flags,
403 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, 364 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
404 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); 365 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
405 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 366 security_req_classify_flow(req, flowi4_to_flowi(fl4));
406 rt = ip_route_output_flow(net, fl4, sk); 367 rt = ip_route_output_flow(net, fl4, sk);
407 if (IS_ERR(rt)) 368 if (IS_ERR(rt))
408 goto no_route; 369 goto no_route;
409 if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) 370 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
410 goto route_err; 371 goto route_err;
411 return &rt->dst; 372 return &rt->dst;
412 373
@@ -424,15 +385,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
424{ 385{
425 const struct inet_request_sock *ireq = inet_rsk(req); 386 const struct inet_request_sock *ireq = inet_rsk(req);
426 struct inet_sock *newinet = inet_sk(newsk); 387 struct inet_sock *newinet = inet_sk(newsk);
427 struct ip_options_rcu *opt; 388 struct ip_options_rcu *opt = ireq->opt;
428 struct net *net = sock_net(sk); 389 struct net *net = sock_net(sk);
429 struct flowi4 *fl4; 390 struct flowi4 *fl4;
430 struct rtable *rt; 391 struct rtable *rt;
431 392
432 fl4 = &newinet->cork.fl.u.ip4; 393 fl4 = &newinet->cork.fl.u.ip4;
433
434 rcu_read_lock();
435 opt = rcu_dereference(newinet->inet_opt);
436 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 394 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
437 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 395 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
438 sk->sk_protocol, inet_sk_flowi_flags(sk), 396 sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -442,15 +400,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
442 rt = ip_route_output_flow(net, fl4, sk); 400 rt = ip_route_output_flow(net, fl4, sk);
443 if (IS_ERR(rt)) 401 if (IS_ERR(rt))
444 goto no_route; 402 goto no_route;
445 if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) 403 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
446 goto route_err; 404 goto route_err;
447 rcu_read_unlock();
448 return &rt->dst; 405 return &rt->dst;
449 406
450route_err: 407route_err:
451 ip_rt_put(rt); 408 ip_rt_put(rt);
452no_route: 409no_route:
453 rcu_read_unlock();
454 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 410 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
455 return NULL; 411 return NULL;
456} 412}
@@ -462,7 +418,7 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
462 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); 418 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
463} 419}
464 420
465#if IS_ENABLED(CONFIG_IPV6) 421#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
466#define AF_INET_FAMILY(fam) ((fam) == AF_INET) 422#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
467#else 423#else
468#define AF_INET_FAMILY(fam) 1 424#define AF_INET_FAMILY(fam) 1
@@ -521,31 +477,21 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
521 int *expire, int *resend) 477 int *expire, int *resend)
522{ 478{
523 if (!rskq_defer_accept) { 479 if (!rskq_defer_accept) {
524 *expire = req->num_timeout >= thresh; 480 *expire = req->retrans >= thresh;
525 *resend = 1; 481 *resend = 1;
526 return; 482 return;
527 } 483 }
528 *expire = req->num_timeout >= thresh && 484 *expire = req->retrans >= thresh &&
529 (!inet_rsk(req)->acked || req->num_timeout >= max_retries); 485 (!inet_rsk(req)->acked || req->retrans >= max_retries);
530 /* 486 /*
531 * Do not resend while waiting for data after ACK, 487 * Do not resend while waiting for data after ACK,
532 * start to resend on end of deferring period to give 488 * start to resend on end of deferring period to give
533 * last chance for data or ACK to create established socket. 489 * last chance for data or ACK to create established socket.
534 */ 490 */
535 *resend = !inet_rsk(req)->acked || 491 *resend = !inet_rsk(req)->acked ||
536 req->num_timeout >= rskq_defer_accept - 1; 492 req->retrans >= rskq_defer_accept - 1;
537} 493}
538 494
539int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
540{
541 int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
542
543 if (!err)
544 req->num_retrans++;
545 return err;
546}
547EXPORT_SYMBOL(inet_rtx_syn_ack);
548
549void inet_csk_reqsk_queue_prune(struct sock *parent, 495void inet_csk_reqsk_queue_prune(struct sock *parent,
550 const unsigned long interval, 496 const unsigned long interval,
551 const unsigned long timeout, 497 const unsigned long timeout,
@@ -565,7 +511,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
565 511
566 /* Normally all the openreqs are young and become mature 512 /* Normally all the openreqs are young and become mature
567 * (i.e. converted to established socket) for first timeout. 513 * (i.e. converted to established socket) for first timeout.
568 * If synack was not acknowledged for 1 second, it means 514 * If synack was not acknowledged for 3 seconds, it means
569 * one of the following things: synack was lost, ack was lost, 515 * one of the following things: synack was lost, ack was lost,
570 * rtt is high or nobody planned to ack (i.e. synflood). 516 * rtt is high or nobody planned to ack (i.e. synflood).
571 * When server is a bit loaded, queue is populated with old 517 * When server is a bit loaded, queue is populated with old
@@ -606,17 +552,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
606 syn_ack_recalc(req, thresh, max_retries, 552 syn_ack_recalc(req, thresh, max_retries,
607 queue->rskq_defer_accept, 553 queue->rskq_defer_accept,
608 &expire, &resend); 554 &expire, &resend);
609 req->rsk_ops->syn_ack_timeout(parent, req); 555 if (req->rsk_ops->syn_ack_timeout)
556 req->rsk_ops->syn_ack_timeout(parent, req);
610 if (!expire && 557 if (!expire &&
611 (!resend || 558 (!resend ||
612 !inet_rtx_syn_ack(parent, req) || 559 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
613 inet_rsk(req)->acked)) { 560 inet_rsk(req)->acked)) {
614 unsigned long timeo; 561 unsigned long timeo;
615 562
616 if (req->num_timeout++ == 0) 563 if (req->retrans++ == 0)
617 lopt->qlen_young--; 564 lopt->qlen_young--;
618 timeo = min(timeout << req->num_timeout, 565 timeo = min((timeout << req->retrans), max_rto);
619 max_rto);
620 req->expires = now + timeo; 566 req->expires = now + timeo;
621 reqp = &req->dl_next; 567 reqp = &req->dl_next;
622 continue; 568 continue;
@@ -642,19 +588,10 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
642} 588}
643EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 589EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
644 590
645/** 591struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
646 * inet_csk_clone_lock - clone an inet socket, and lock its clone 592 const gfp_t priority)
647 * @sk: the socket to clone
648 * @req: request_sock
649 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
650 *
651 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
652 */
653struct sock *inet_csk_clone_lock(const struct sock *sk,
654 const struct request_sock *req,
655 const gfp_t priority)
656{ 593{
657 struct sock *newsk = sk_clone_lock(sk, priority); 594 struct sock *newsk = sk_clone(sk, priority);
658 595
659 if (newsk != NULL) { 596 if (newsk != NULL) {
660 struct inet_connection_sock *newicsk = inet_csk(newsk); 597 struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -678,7 +615,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
678 } 615 }
679 return newsk; 616 return newsk;
680} 617}
681EXPORT_SYMBOL_GPL(inet_csk_clone_lock); 618EXPORT_SYMBOL_GPL(inet_csk_clone);
682 619
683/* 620/*
684 * At this point, there should be no process reference to this 621 * At this point, there should be no process reference to this
@@ -710,22 +647,6 @@ void inet_csk_destroy_sock(struct sock *sk)
710} 647}
711EXPORT_SYMBOL(inet_csk_destroy_sock); 648EXPORT_SYMBOL(inet_csk_destroy_sock);
712 649
713/* This function allows to force a closure of a socket after the call to
714 * tcp/dccp_create_openreq_child().
715 */
716void inet_csk_prepare_forced_close(struct sock *sk)
717{
718 /* sk_clone_lock locked the socket and set refcnt to 2 */
719 bh_unlock_sock(sk);
720 sock_put(sk);
721
722 /* The below has to be done to allow calling inet_csk_destroy_sock */
723 sock_set_flag(sk, SOCK_DEAD);
724 percpu_counter_inc(sk->sk_prot->orphan_count);
725 inet_sk(sk)->inet_num = 0;
726}
727EXPORT_SYMBOL(inet_csk_prepare_forced_close);
728
729int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 650int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
730{ 651{
731 struct inet_sock *inet = inet_sk(sk); 652 struct inet_sock *inet = inet_sk(sk);
@@ -767,14 +688,13 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
767void inet_csk_listen_stop(struct sock *sk) 688void inet_csk_listen_stop(struct sock *sk)
768{ 689{
769 struct inet_connection_sock *icsk = inet_csk(sk); 690 struct inet_connection_sock *icsk = inet_csk(sk);
770 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
771 struct request_sock *acc_req; 691 struct request_sock *acc_req;
772 struct request_sock *req; 692 struct request_sock *req;
773 693
774 inet_csk_delete_keepalive_timer(sk); 694 inet_csk_delete_keepalive_timer(sk);
775 695
776 /* make all the listen_opt local to us */ 696 /* make all the listen_opt local to us */
777 acc_req = reqsk_queue_yank_acceptq(queue); 697 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
778 698
779 /* Following specs, it would be better either to send FIN 699 /* Following specs, it would be better either to send FIN
780 * (and enter FIN-WAIT-1, it is normal close) 700 * (and enter FIN-WAIT-1, it is normal close)
@@ -784,7 +704,7 @@ void inet_csk_listen_stop(struct sock *sk)
784 * To be honest, we are not able to make either 704 * To be honest, we are not able to make either
785 * of the variants now. --ANK 705 * of the variants now. --ANK
786 */ 706 */
787 reqsk_queue_destroy(queue); 707 reqsk_queue_destroy(&icsk->icsk_accept_queue);
788 708
789 while ((req = acc_req) != NULL) { 709 while ((req = acc_req) != NULL) {
790 struct sock *child = req->sk; 710 struct sock *child = req->sk;
@@ -802,19 +722,6 @@ void inet_csk_listen_stop(struct sock *sk)
802 722
803 percpu_counter_inc(sk->sk_prot->orphan_count); 723 percpu_counter_inc(sk->sk_prot->orphan_count);
804 724
805 if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
806 BUG_ON(tcp_sk(child)->fastopen_rsk != req);
807 BUG_ON(sk != tcp_rsk(req)->listener);
808
809 /* Paranoid, to prevent race condition if
810 * an inbound pkt destined for child is
811 * blocked by sock lock in tcp_v4_rcv().
812 * Also to satisfy an assertion in
813 * tcp_v4_destroy_sock().
814 */
815 tcp_sk(child)->fastopen_rsk = NULL;
816 sock_put(sk);
817 }
818 inet_csk_destroy_sock(child); 725 inet_csk_destroy_sock(child);
819 726
820 bh_unlock_sock(child); 727 bh_unlock_sock(child);
@@ -824,17 +731,6 @@ void inet_csk_listen_stop(struct sock *sk)
824 sk_acceptq_removed(sk); 731 sk_acceptq_removed(sk);
825 __reqsk_free(req); 732 __reqsk_free(req);
826 } 733 }
827 if (queue->fastopenq != NULL) {
828 /* Free all the reqs queued in rskq_rst_head. */
829 spin_lock_bh(&queue->fastopenq->lock);
830 acc_req = queue->fastopenq->rskq_rst_head;
831 queue->fastopenq->rskq_rst_head = NULL;
832 spin_unlock_bh(&queue->fastopenq->lock);
833 while ((req = acc_req) != NULL) {
834 acc_req = req->dl_next;
835 __reqsk_free(req);
836 }
837 }
838 WARN_ON(sk->sk_ack_backlog); 734 WARN_ON(sk->sk_ack_backlog);
839} 735}
840EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 736EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
@@ -877,49 +773,3 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
877} 773}
878EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 774EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
879#endif 775#endif
880
881static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
882{
883 const struct inet_sock *inet = inet_sk(sk);
884 const struct ip_options_rcu *inet_opt;
885 __be32 daddr = inet->inet_daddr;
886 struct flowi4 *fl4;
887 struct rtable *rt;
888
889 rcu_read_lock();
890 inet_opt = rcu_dereference(inet->inet_opt);
891 if (inet_opt && inet_opt->opt.srr)
892 daddr = inet_opt->opt.faddr;
893 fl4 = &fl->u.ip4;
894 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
895 inet->inet_saddr, inet->inet_dport,
896 inet->inet_sport, sk->sk_protocol,
897 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
898 if (IS_ERR(rt))
899 rt = NULL;
900 if (rt)
901 sk_setup_caps(sk, &rt->dst);
902 rcu_read_unlock();
903
904 return &rt->dst;
905}
906
907struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
908{
909 struct dst_entry *dst = __sk_dst_check(sk, 0);
910 struct inet_sock *inet = inet_sk(sk);
911
912 if (!dst) {
913 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
914 if (!dst)
915 goto out;
916 }
917 dst->ops->update_pmtu(dst, sk, NULL, mtu);
918
919 dst = __sk_dst_check(sk, 0);
920 if (!dst)
921 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
922out:
923 return dst;
924}
925EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
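
inet_get_local_port_range() in the hunks above copies the {low, high} port pair inside a do-loop started by read_seqbegin() on sysctl_local_ports.lock, so a concurrent sysctl write can never hand the caller a torn range. A minimal sketch of that read-retry idiom, assuming a simplified seqcount built on C11 atomics purely for illustration rather than the kernel's seqlock API (a real seqlock also needs memory barriers around the data accesses, which this sketch glosses over):

    #include <stdatomic.h>

    struct port_range {
        _Atomic unsigned int seq;   /* even = stable, odd = write in progress */
        int low;
        int high;
    };

    /* Writer: bump to odd, update the pair, bump back to even. */
    static void port_range_set(struct port_range *pr, int low, int high)
    {
        atomic_fetch_add_explicit(&pr->seq, 1, memory_order_release);
        pr->low  = low;
        pr->high = high;
        atomic_fetch_add_explicit(&pr->seq, 1, memory_order_release);
    }

    /* Reader: retry until an even, unchanged sequence brackets the copy,
     * mirroring the read_seqbegin() loop in the hunk above. */
    static void port_range_get(struct port_range *pr, int *low, int *high)
    {
        unsigned int seq;

        do {
            seq = atomic_load_explicit(&pr->seq, memory_order_acquire);
            *low  = pr->low;
            *high = pr->high;
        } while ((seq & 1) ||
                 seq != atomic_load_explicit(&pr->seq, memory_order_acquire));
    }

Readers never block the writer; they simply redo the cheap two-int copy whenever the sequence number was odd or changed mid-copy, which is why this pattern suits a value that is read on every bind() but written only from sysctl.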
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788..389a2e6a17f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -33,7 +33,6 @@
33#include <linux/stddef.h> 33#include <linux/stddef.h>
34 34
35#include <linux/inet_diag.h> 35#include <linux/inet_diag.h>
36#include <linux/sock_diag.h>
37 36
38static const struct inet_diag_handler **inet_diag_table; 37static const struct inet_diag_handler **inet_diag_table;
39 38
@@ -44,25 +43,26 @@ struct inet_diag_entry {
44 u16 dport; 43 u16 dport;
45 u16 family; 44 u16 family;
46 u16 userlocks; 45 u16 userlocks;
47#if IS_ENABLED(CONFIG_IPV6)
48 struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
49 struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
50#endif
51}; 46};
52 47
48static struct sock *idiagnl;
49
50#define INET_DIAG_PUT(skb, attrtype, attrlen) \
51 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
52
53static DEFINE_MUTEX(inet_diag_table_mutex); 53static DEFINE_MUTEX(inet_diag_table_mutex);
54 54
55static const struct inet_diag_handler *inet_diag_lock_handler(int proto) 55static const struct inet_diag_handler *inet_diag_lock_handler(int type)
56{ 56{
57 if (!inet_diag_table[proto]) 57 if (!inet_diag_table[type])
58 request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 58 request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
59 NETLINK_SOCK_DIAG, AF_INET, proto); 59 NETLINK_INET_DIAG, type);
60 60
61 mutex_lock(&inet_diag_table_mutex); 61 mutex_lock(&inet_diag_table_mutex);
62 if (!inet_diag_table[proto]) 62 if (!inet_diag_table[type])
63 return ERR_PTR(-ENOENT); 63 return ERR_PTR(-ENOENT);
64 64
65 return inet_diag_table[proto]; 65 return inet_diag_table[type];
66} 66}
67 67
68static inline void inet_diag_unlock_handler( 68static inline void inet_diag_unlock_handler(
@@ -71,91 +71,68 @@ static inline void inet_diag_unlock_handler(
71 mutex_unlock(&inet_diag_table_mutex); 71 mutex_unlock(&inet_diag_table_mutex);
72} 72}
73 73
74int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, 74static int inet_csk_diag_fill(struct sock *sk,
75 struct sk_buff *skb, struct inet_diag_req_v2 *req, 75 struct sk_buff *skb,
76 struct user_namespace *user_ns, 76 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
77 u32 portid, u32 seq, u16 nlmsg_flags,
78 const struct nlmsghdr *unlh) 77 const struct nlmsghdr *unlh)
79{ 78{
80 const struct inet_sock *inet = inet_sk(sk); 79 const struct inet_sock *inet = inet_sk(sk);
80 const struct inet_connection_sock *icsk = inet_csk(sk);
81 struct inet_diag_msg *r; 81 struct inet_diag_msg *r;
82 struct nlmsghdr *nlh; 82 struct nlmsghdr *nlh;
83 struct nlattr *attr;
84 void *info = NULL; 83 void *info = NULL;
84 struct inet_diag_meminfo *minfo = NULL;
85 unsigned char *b = skb_tail_pointer(skb);
85 const struct inet_diag_handler *handler; 86 const struct inet_diag_handler *handler;
86 int ext = req->idiag_ext;
87 87
88 handler = inet_diag_table[req->sdiag_protocol]; 88 handler = inet_diag_table[unlh->nlmsg_type];
89 BUG_ON(handler == NULL); 89 BUG_ON(handler == NULL);
90 90
91 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 91 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
92 nlmsg_flags); 92 nlh->nlmsg_flags = nlmsg_flags;
93 if (!nlh)
94 return -EMSGSIZE;
95 93
96 r = nlmsg_data(nlh); 94 r = NLMSG_DATA(nlh);
97 BUG_ON(sk->sk_state == TCP_TIME_WAIT); 95 BUG_ON(sk->sk_state == TCP_TIME_WAIT);
98 96
97 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
98 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
99
100 if (ext & (1 << (INET_DIAG_INFO - 1)))
101 info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
102 handler->idiag_info_size);
103
104 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
105 const size_t len = strlen(icsk->icsk_ca_ops->name);
106
107 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
108 icsk->icsk_ca_ops->name);
109 }
110
99 r->idiag_family = sk->sk_family; 111 r->idiag_family = sk->sk_family;
100 r->idiag_state = sk->sk_state; 112 r->idiag_state = sk->sk_state;
101 r->idiag_timer = 0; 113 r->idiag_timer = 0;
102 r->idiag_retrans = 0; 114 r->idiag_retrans = 0;
103 115
104 r->id.idiag_if = sk->sk_bound_dev_if; 116 r->id.idiag_if = sk->sk_bound_dev_if;
105 sock_diag_save_cookie(sk, r->id.idiag_cookie); 117 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
118 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
106 119
107 r->id.idiag_sport = inet->inet_sport; 120 r->id.idiag_sport = inet->inet_sport;
108 r->id.idiag_dport = inet->inet_dport; 121 r->id.idiag_dport = inet->inet_dport;
109 r->id.idiag_src[0] = inet->inet_rcv_saddr; 122 r->id.idiag_src[0] = inet->inet_rcv_saddr;
110 r->id.idiag_dst[0] = inet->inet_daddr; 123 r->id.idiag_dst[0] = inet->inet_daddr;
111 124
112 if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
113 goto errout;
114
115 /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
116 * hence this needs to be included regardless of socket family.
117 */
118 if (ext & (1 << (INET_DIAG_TOS - 1)))
119 if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
120 goto errout;
121
122#if IS_ENABLED(CONFIG_IPV6)
123 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
124 const struct ipv6_pinfo *np = inet6_sk(sk); 127 const struct ipv6_pinfo *np = inet6_sk(sk);
125 128
126 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; 129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
127 *(struct in6_addr *)r->id.idiag_dst = np->daddr; 130 &np->rcv_saddr);
128 131 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
129 if (ext & (1 << (INET_DIAG_TCLASS - 1))) 132 &np->daddr);
130 if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
131 goto errout;
132 } 133 }
133#endif 134#endif
134 135
135 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
136 r->idiag_inode = sock_i_ino(sk);
137
138 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
139 struct inet_diag_meminfo minfo = {
140 .idiag_rmem = sk_rmem_alloc_get(sk),
141 .idiag_wmem = sk->sk_wmem_queued,
142 .idiag_fmem = sk->sk_forward_alloc,
143 .idiag_tmem = sk_wmem_alloc_get(sk),
144 };
145
146 if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
147 goto errout;
148 }
149
150 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
151 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
152 goto errout;
153
154 if (icsk == NULL) {
155 handler->idiag_get_info(sk, r, NULL);
156 goto out;
157 }
158
159#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) 136#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
160 137
161 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 138 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
@@ -176,62 +153,47 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
176 } 153 }
177#undef EXPIRES_IN_MS 154#undef EXPIRES_IN_MS
178 155
179 if (ext & (1 << (INET_DIAG_INFO - 1))) { 156 r->idiag_uid = sock_i_uid(sk);
180 attr = nla_reserve(skb, INET_DIAG_INFO, 157 r->idiag_inode = sock_i_ino(sk);
181 sizeof(struct tcp_info));
182 if (!attr)
183 goto errout;
184 158
185 info = nla_data(attr); 159 if (minfo) {
160 minfo->idiag_rmem = sk_rmem_alloc_get(sk);
161 minfo->idiag_wmem = sk->sk_wmem_queued;
162 minfo->idiag_fmem = sk->sk_forward_alloc;
163 minfo->idiag_tmem = sk_wmem_alloc_get(sk);
186 } 164 }
187 165
188 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
189 if (nla_put_string(skb, INET_DIAG_CONG,
190 icsk->icsk_ca_ops->name) < 0)
191 goto errout;
192
193 handler->idiag_get_info(sk, r, info); 166 handler->idiag_get_info(sk, r, info);
194 167
195 if (sk->sk_state < TCP_TIME_WAIT && 168 if (sk->sk_state < TCP_TIME_WAIT &&
196 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) 169 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
197 icsk->icsk_ca_ops->get_info(sk, ext, skb); 170 icsk->icsk_ca_ops->get_info(sk, ext, skb);
198 171
199out: 172 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
200 return nlmsg_end(skb, nlh); 173 return skb->len;
201 174
202errout: 175rtattr_failure:
203 nlmsg_cancel(skb, nlh); 176nlmsg_failure:
177 nlmsg_trim(skb, b);
204 return -EMSGSIZE; 178 return -EMSGSIZE;
205} 179}
206EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
207
208static int inet_csk_diag_fill(struct sock *sk,
209 struct sk_buff *skb, struct inet_diag_req_v2 *req,
210 struct user_namespace *user_ns,
211 u32 portid, u32 seq, u16 nlmsg_flags,
212 const struct nlmsghdr *unlh)
213{
214 return inet_sk_diag_fill(sk, inet_csk(sk),
215 skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
216}
217 180
218static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, 181static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
219 struct sk_buff *skb, struct inet_diag_req_v2 *req, 182 struct sk_buff *skb, int ext, u32 pid,
220 u32 portid, u32 seq, u16 nlmsg_flags, 183 u32 seq, u16 nlmsg_flags,
221 const struct nlmsghdr *unlh) 184 const struct nlmsghdr *unlh)
222{ 185{
223 long tmo; 186 long tmo;
224 struct inet_diag_msg *r; 187 struct inet_diag_msg *r;
225 struct nlmsghdr *nlh; 188 const unsigned char *previous_tail = skb_tail_pointer(skb);
226 189 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
227 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 190 unlh->nlmsg_type, sizeof(*r));
228 nlmsg_flags);
229 if (!nlh)
230 return -EMSGSIZE;
231 191
232 r = nlmsg_data(nlh); 192 r = NLMSG_DATA(nlh);
233 BUG_ON(tw->tw_state != TCP_TIME_WAIT); 193 BUG_ON(tw->tw_state != TCP_TIME_WAIT);
234 194
195 nlh->nlmsg_flags = nlmsg_flags;
196
235 tmo = tw->tw_ttd - jiffies; 197 tmo = tw->tw_ttd - jiffies;
236 if (tmo < 0) 198 if (tmo < 0)
237 tmo = 0; 199 tmo = 0;
@@ -239,7 +201,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
239 r->idiag_family = tw->tw_family; 201 r->idiag_family = tw->tw_family;
240 r->idiag_retrans = 0; 202 r->idiag_retrans = 0;
241 r->id.idiag_if = tw->tw_bound_dev_if; 203 r->id.idiag_if = tw->tw_bound_dev_if;
242 sock_diag_save_cookie(tw, r->id.idiag_cookie); 204 r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
205 r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
243 r->id.idiag_sport = tw->tw_sport; 206 r->id.idiag_sport = tw->tw_sport;
244 r->id.idiag_dport = tw->tw_dport; 207 r->id.idiag_dport = tw->tw_dport;
245 r->id.idiag_src[0] = tw->tw_rcv_saddr; 208 r->id.idiag_src[0] = tw->tw_rcv_saddr;
@@ -251,49 +214,62 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
251 r->idiag_wqueue = 0; 214 r->idiag_wqueue = 0;
252 r->idiag_uid = 0; 215 r->idiag_uid = 0;
253 r->idiag_inode = 0; 216 r->idiag_inode = 0;
254#if IS_ENABLED(CONFIG_IPV6) 217#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
255 if (tw->tw_family == AF_INET6) { 218 if (tw->tw_family == AF_INET6) {
256 const struct inet6_timewait_sock *tw6 = 219 const struct inet6_timewait_sock *tw6 =
257 inet6_twsk((struct sock *)tw); 220 inet6_twsk((struct sock *)tw);
258 221
259 *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; 222 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
260 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; 223 &tw6->tw_v6_rcv_saddr);
224 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
225 &tw6->tw_v6_daddr);
261 } 226 }
262#endif 227#endif
263 228 nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
264 return nlmsg_end(skb, nlh); 229 return skb->len;
230nlmsg_failure:
231 nlmsg_trim(skb, previous_tail);
232 return -EMSGSIZE;
265} 233}
266 234
267static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 235static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
268 struct inet_diag_req_v2 *r, 236 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
269 struct user_namespace *user_ns,
270 u32 portid, u32 seq, u16 nlmsg_flags,
271 const struct nlmsghdr *unlh) 237 const struct nlmsghdr *unlh)
272{ 238{
273 if (sk->sk_state == TCP_TIME_WAIT) 239 if (sk->sk_state == TCP_TIME_WAIT)
274 return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, 240 return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
275 skb, r, portid, seq, nlmsg_flags, 241 skb, ext, pid, seq, nlmsg_flags,
276 unlh); 242 unlh);
277 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); 243 return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
278} 244}
279 245
280int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, 246static int inet_diag_get_exact(struct sk_buff *in_skb,
281 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) 247 const struct nlmsghdr *nlh)
282{ 248{
283 int err; 249 int err;
284 struct sock *sk; 250 struct sock *sk;
251 struct inet_diag_req *req = NLMSG_DATA(nlh);
285 struct sk_buff *rep; 252 struct sk_buff *rep;
286 struct net *net = sock_net(in_skb->sk); 253 struct inet_hashinfo *hashinfo;
254 const struct inet_diag_handler *handler;
287 255
256 handler = inet_diag_lock_handler(nlh->nlmsg_type);
257 if (IS_ERR(handler)) {
258 err = PTR_ERR(handler);
259 goto unlock;
260 }
261
262 hashinfo = handler->idiag_hashinfo;
288 err = -EINVAL; 263 err = -EINVAL;
289 if (req->sdiag_family == AF_INET) { 264
290 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], 265 if (req->idiag_family == AF_INET) {
266 sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
291 req->id.idiag_dport, req->id.idiag_src[0], 267 req->id.idiag_dport, req->id.idiag_src[0],
292 req->id.idiag_sport, req->id.idiag_if); 268 req->id.idiag_sport, req->id.idiag_if);
293 } 269 }
294#if IS_ENABLED(CONFIG_IPV6) 270#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
295 else if (req->sdiag_family == AF_INET6) { 271 else if (req->idiag_family == AF_INET6) {
296 sk = inet6_lookup(net, hashinfo, 272 sk = inet6_lookup(&init_net, hashinfo,
297 (struct in6_addr *)req->id.idiag_dst, 273 (struct in6_addr *)req->id.idiag_dst,
298 req->id.idiag_dport, 274 req->id.idiag_dport,
299 (struct in6_addr *)req->id.idiag_src, 275 (struct in6_addr *)req->id.idiag_src,
@@ -302,35 +278,37 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
302 } 278 }
303#endif 279#endif
304 else { 280 else {
305 goto out_nosk; 281 goto unlock;
306 } 282 }
307 283
308 err = -ENOENT; 284 err = -ENOENT;
309 if (sk == NULL) 285 if (sk == NULL)
310 goto out_nosk; 286 goto unlock;
311 287
312 err = sock_diag_check_cookie(sk, req->id.idiag_cookie); 288 err = -ESTALE;
313 if (err) 289 if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
290 req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
291 ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
292 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
314 goto out; 293 goto out;
315 294
316 rep = nlmsg_new(sizeof(struct inet_diag_msg) + 295 err = -ENOMEM;
317 sizeof(struct inet_diag_meminfo) + 296 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
318 sizeof(struct tcp_info) + 64, GFP_KERNEL); 297 sizeof(struct inet_diag_meminfo) +
319 if (!rep) { 298 handler->idiag_info_size + 64)),
320 err = -ENOMEM; 299 GFP_KERNEL);
300 if (!rep)
321 goto out; 301 goto out;
322 }
323 302
324 err = sk_diag_fill(sk, rep, req, 303 err = sk_diag_fill(sk, rep, req->idiag_ext,
325 sk_user_ns(NETLINK_CB(in_skb).ssk), 304 NETLINK_CB(in_skb).pid,
326 NETLINK_CB(in_skb).portid,
327 nlh->nlmsg_seq, 0, nlh); 305 nlh->nlmsg_seq, 0, nlh);
328 if (err < 0) { 306 if (err < 0) {
329 WARN_ON(err == -EMSGSIZE); 307 WARN_ON(err == -EMSGSIZE);
330 nlmsg_free(rep); 308 kfree_skb(rep);
331 goto out; 309 goto out;
332 } 310 }
333 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, 311 err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
334 MSG_DONTWAIT); 312 MSG_DONTWAIT);
335 if (err > 0) 313 if (err > 0)
336 err = 0; 314 err = 0;
@@ -342,25 +320,8 @@ out:
342 else 320 else
343 sock_put(sk); 321 sock_put(sk);
344 } 322 }
345out_nosk: 323unlock:
346 return err;
347}
348EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
349
350static int inet_diag_get_exact(struct sk_buff *in_skb,
351 const struct nlmsghdr *nlh,
352 struct inet_diag_req_v2 *req)
353{
354 const struct inet_diag_handler *handler;
355 int err;
356
357 handler = inet_diag_lock_handler(req->sdiag_protocol);
358 if (IS_ERR(handler))
359 err = PTR_ERR(handler);
360 else
361 err = handler->dump_one(in_skb, nlh, req);
362 inet_diag_unlock_handler(handler); 324 inet_diag_unlock_handler(handler);
363
364 return err; 325 return err;
365} 326}
366 327
@@ -391,12 +352,9 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
391} 352}
392 353
393 354
394static int inet_diag_bc_run(const struct nlattr *_bc, 355static int inet_diag_bc_run(const void *bc, int len,
395 const struct inet_diag_entry *entry) 356 const struct inet_diag_entry *entry)
396{ 357{
397 const void *bc = nla_data(_bc);
398 int len = nla_len(_bc);
399
400 while (len > 0) { 358 while (len > 0) {
401 int yes = 1; 359 int yes = 1;
402 const struct inet_diag_bc_op *op = bc; 360 const struct inet_diag_bc_op *op = bc;
@@ -435,31 +393,25 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
435 break; 393 break;
436 } 394 }
437 395
396 if (cond->prefix_len == 0)
397 break;
398
438 if (op->code == INET_DIAG_BC_S_COND) 399 if (op->code == INET_DIAG_BC_S_COND)
439 addr = entry->saddr; 400 addr = entry->saddr;
440 else 401 else
441 addr = entry->daddr; 402 addr = entry->daddr;
442 403
443 if (cond->family != AF_UNSPEC &&
444 cond->family != entry->family) {
445 if (entry->family == AF_INET6 &&
446 cond->family == AF_INET) {
447 if (addr[0] == 0 && addr[1] == 0 &&
448 addr[2] == htonl(0xffff) &&
449 bitstring_match(addr + 3,
450 cond->addr,
451 cond->prefix_len))
452 break;
453 }
454 yes = 0;
455 break;
456 }
457
458 if (cond->prefix_len == 0)
459 break;
460 if (bitstring_match(addr, cond->addr, 404 if (bitstring_match(addr, cond->addr,
461 cond->prefix_len)) 405 cond->prefix_len))
462 break; 406 break;
407 if (entry->family == AF_INET6 &&
408 cond->family == AF_INET) {
409 if (addr[0] == 0 && addr[1] == 0 &&
410 addr[2] == htonl(0xffff) &&
411 bitstring_match(addr + 3, cond->addr,
412 cond->prefix_len))
413 break;
414 }
463 yes = 0; 415 yes = 0;
464 break; 416 break;
465 } 417 }
@@ -476,35 +428,6 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
476 return len == 0; 428 return len == 0;
477} 429}
478 430
479int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
480{
481 struct inet_diag_entry entry;
482 struct inet_sock *inet = inet_sk(sk);
483
484 if (bc == NULL)
485 return 1;
486
487 entry.family = sk->sk_family;
488#if IS_ENABLED(CONFIG_IPV6)
489 if (entry.family == AF_INET6) {
490 struct ipv6_pinfo *np = inet6_sk(sk);
491
492 entry.saddr = np->rcv_saddr.s6_addr32;
493 entry.daddr = np->daddr.s6_addr32;
494 } else
495#endif
496 {
497 entry.saddr = &inet->inet_rcv_saddr;
498 entry.daddr = &inet->inet_daddr;
499 }
500 entry.sport = inet->inet_num;
501 entry.dport = ntohs(inet->inet_dport);
502 entry.userlocks = sk->sk_userlocks;
503
504 return inet_diag_bc_run(bc, &entry);
505}
506EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
507
508static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
509{ 432{
510 while (len >= 0) { 433 while (len >= 0) {
@@ -522,55 +445,6 @@ static int valid_cc(const void *bc, int len, int cc)
522 return 0; 445 return 0;
523} 446}
524 447
525/* Validate an inet_diag_hostcond. */
526static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
527 int *min_len)
528{
529 int addr_len;
530 struct inet_diag_hostcond *cond;
531
532 /* Check hostcond space. */
533 *min_len += sizeof(struct inet_diag_hostcond);
534 if (len < *min_len)
535 return false;
536 cond = (struct inet_diag_hostcond *)(op + 1);
537
538 /* Check address family and address length. */
539 switch (cond->family) {
540 case AF_UNSPEC:
541 addr_len = 0;
542 break;
543 case AF_INET:
544 addr_len = sizeof(struct in_addr);
545 break;
546 case AF_INET6:
547 addr_len = sizeof(struct in6_addr);
548 break;
549 default:
550 return false;
551 }
552 *min_len += addr_len;
553 if (len < *min_len)
554 return false;
555
556 /* Check prefix length (in bits) vs address length (in bytes). */
557 if (cond->prefix_len > 8 * addr_len)
558 return false;
559
560 return true;
561}
562
563/* Validate a port comparison operator. */
564static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
565 int len, int *min_len)
566{
567 /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
568 *min_len += sizeof(struct inet_diag_bc_op);
569 if (len < *min_len)
570 return false;
571 return true;
572}
573
574static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) 448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
575{ 449{
576 const void *bc = bytecode; 450 const void *bc = bytecode;
@@ -578,39 +452,29 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
578 452
579 while (len > 0) { 453 while (len > 0) {
580 const struct inet_diag_bc_op *op = bc; 454 const struct inet_diag_bc_op *op = bc;
581 int min_len = sizeof(struct inet_diag_bc_op);
582 455
583//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); 456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
584 switch (op->code) { 457 switch (op->code) {
458 case INET_DIAG_BC_AUTO:
585 case INET_DIAG_BC_S_COND: 459 case INET_DIAG_BC_S_COND:
586 case INET_DIAG_BC_D_COND: 460 case INET_DIAG_BC_D_COND:
587 if (!valid_hostcond(bc, len, &min_len))
588 return -EINVAL;
589 break;
590 case INET_DIAG_BC_S_GE: 461 case INET_DIAG_BC_S_GE:
591 case INET_DIAG_BC_S_LE: 462 case INET_DIAG_BC_S_LE:
592 case INET_DIAG_BC_D_GE: 463 case INET_DIAG_BC_D_GE:
593 case INET_DIAG_BC_D_LE: 464 case INET_DIAG_BC_D_LE:
594 if (!valid_port_comparison(bc, len, &min_len)) 465 case INET_DIAG_BC_JMP:
466 if (op->no < 4 || op->no > len + 4 || op->no & 3)
467 return -EINVAL;
468 if (op->no < len &&
469 !valid_cc(bytecode, bytecode_len, len - op->no))
595 return -EINVAL; 470 return -EINVAL;
596 break; 471 break;
597 case INET_DIAG_BC_AUTO:
598 case INET_DIAG_BC_JMP:
599 case INET_DIAG_BC_NOP: 472 case INET_DIAG_BC_NOP:
600 break; 473 break;
601 default: 474 default:
602 return -EINVAL; 475 return -EINVAL;
603 } 476 }
604 477 if (op->yes < 4 || op->yes > len + 4 || op->yes & 3)
605 if (op->code != INET_DIAG_BC_NOP) {
606 if (op->no < min_len || op->no > len + 4 || op->no & 3)
607 return -EINVAL;
608 if (op->no < len &&
609 !valid_cc(bytecode, bytecode_len, len - op->no))
610 return -EINVAL;
611 }
612
613 if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
614 return -EINVAL; 478 return -EINVAL;
615 bc += op->yes; 479 bc += op->yes;
616 len -= op->yes; 480 len -= op->yes;
@@ -620,30 +484,57 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
620 484
621static int inet_csk_diag_dump(struct sock *sk, 485static int inet_csk_diag_dump(struct sock *sk,
622 struct sk_buff *skb, 486 struct sk_buff *skb,
623 struct netlink_callback *cb, 487 struct netlink_callback *cb)
624 struct inet_diag_req_v2 *r,
625 const struct nlattr *bc)
626{ 488{
627 if (!inet_diag_bc_sk(bc, sk)) 489 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
628 return 0;
629 490
630 return inet_csk_diag_fill(sk, skb, r, 491 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
631 sk_user_ns(NETLINK_CB(cb->skb).ssk), 492 struct inet_diag_entry entry;
632 NETLINK_CB(cb->skb).portid, 493 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
494 sizeof(*r),
495 INET_DIAG_REQ_BYTECODE);
496 struct inet_sock *inet = inet_sk(sk);
497
498 entry.family = sk->sk_family;
499#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
500 if (entry.family == AF_INET6) {
501 struct ipv6_pinfo *np = inet6_sk(sk);
502
503 entry.saddr = np->rcv_saddr.s6_addr32;
504 entry.daddr = np->daddr.s6_addr32;
505 } else
506#endif
507 {
508 entry.saddr = &inet->inet_rcv_saddr;
509 entry.daddr = &inet->inet_daddr;
510 }
511 entry.sport = inet->inet_num;
512 entry.dport = ntohs(inet->inet_dport);
513 entry.userlocks = sk->sk_userlocks;
514
515 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
516 return 0;
517 }
518
519 return inet_csk_diag_fill(sk, skb, r->idiag_ext,
520 NETLINK_CB(cb->skb).pid,
633 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 521 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
634} 522}
635 523
636static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, 524static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
637 struct sk_buff *skb, 525 struct sk_buff *skb,
638 struct netlink_callback *cb, 526 struct netlink_callback *cb)
639 struct inet_diag_req_v2 *r,
640 const struct nlattr *bc)
641{ 527{
642 if (bc != NULL) { 528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
529
530 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
643 struct inet_diag_entry entry; 531 struct inet_diag_entry entry;
532 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
533 sizeof(*r),
534 INET_DIAG_REQ_BYTECODE);
644 535
645 entry.family = tw->tw_family; 536 entry.family = tw->tw_family;
646#if IS_ENABLED(CONFIG_IPV6) 537#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
647 if (tw->tw_family == AF_INET6) { 538 if (tw->tw_family == AF_INET6) {
648 struct inet6_timewait_sock *tw6 = 539 struct inet6_timewait_sock *tw6 =
649 inet6_twsk((struct sock *)tw); 540 inet6_twsk((struct sock *)tw);
@@ -659,70 +550,38 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
659 entry.dport = ntohs(tw->tw_dport); 550 entry.dport = ntohs(tw->tw_dport);
660 entry.userlocks = 0; 551 entry.userlocks = 0;
661 552
662 if (!inet_diag_bc_run(bc, &entry)) 553 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
663 return 0; 554 return 0;
664 } 555 }
665 556
666 return inet_twsk_diag_fill(tw, skb, r, 557 return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
667 NETLINK_CB(cb->skb).portid, 558 NETLINK_CB(cb->skb).pid,
668 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 559 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
669} 560}
670 561
671/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
672 * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
673 */
674static inline void inet_diag_req_addrs(const struct sock *sk,
675 const struct request_sock *req,
676 struct inet_diag_entry *entry)
677{
678 struct inet_request_sock *ireq = inet_rsk(req);
679
680#if IS_ENABLED(CONFIG_IPV6)
681 if (sk->sk_family == AF_INET6) {
682 if (req->rsk_ops->family == AF_INET6) {
683 entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
684 entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
685 } else if (req->rsk_ops->family == AF_INET) {
686 ipv6_addr_set_v4mapped(ireq->loc_addr,
687 &entry->saddr_storage);
688 ipv6_addr_set_v4mapped(ireq->rmt_addr,
689 &entry->daddr_storage);
690 entry->saddr = entry->saddr_storage.s6_addr32;
691 entry->daddr = entry->daddr_storage.s6_addr32;
692 }
693 } else
694#endif
695 {
696 entry->saddr = &ireq->loc_addr;
697 entry->daddr = &ireq->rmt_addr;
698 }
699}
700
701static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, 562static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
702 struct request_sock *req, 563 struct request_sock *req, u32 pid, u32 seq,
703 struct user_namespace *user_ns,
704 u32 portid, u32 seq,
705 const struct nlmsghdr *unlh) 564 const struct nlmsghdr *unlh)
706{ 565{
707 const struct inet_request_sock *ireq = inet_rsk(req); 566 const struct inet_request_sock *ireq = inet_rsk(req);
708 struct inet_sock *inet = inet_sk(sk); 567 struct inet_sock *inet = inet_sk(sk);
568 unsigned char *b = skb_tail_pointer(skb);
709 struct inet_diag_msg *r; 569 struct inet_diag_msg *r;
710 struct nlmsghdr *nlh; 570 struct nlmsghdr *nlh;
711 long tmo; 571 long tmo;
712 572
713 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 573 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
714 NLM_F_MULTI); 574 nlh->nlmsg_flags = NLM_F_MULTI;
715 if (!nlh) 575 r = NLMSG_DATA(nlh);
716 return -EMSGSIZE;
717 576
718 r = nlmsg_data(nlh);
719 r->idiag_family = sk->sk_family; 577 r->idiag_family = sk->sk_family;
720 r->idiag_state = TCP_SYN_RECV; 578 r->idiag_state = TCP_SYN_RECV;
721 r->idiag_timer = 1; 579 r->idiag_timer = 1;
722 r->idiag_retrans = req->num_retrans; 580 r->idiag_retrans = req->retrans;
723 581
724 r->id.idiag_if = sk->sk_bound_dev_if; 582 r->id.idiag_if = sk->sk_bound_dev_if;
725 sock_diag_save_cookie(req, r->id.idiag_cookie); 583 r->id.idiag_cookie[0] = (u32)(unsigned long)req;
584 r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
726 585
727 tmo = req->expires - jiffies; 586 tmo = req->expires - jiffies;
728 if (tmo < 0) 587 if (tmo < 0)
@@ -735,28 +594,33 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
735 r->idiag_expires = jiffies_to_msecs(tmo); 594 r->idiag_expires = jiffies_to_msecs(tmo);
736 r->idiag_rqueue = 0; 595 r->idiag_rqueue = 0;
737 r->idiag_wqueue = 0; 596 r->idiag_wqueue = 0;
738 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); 597 r->idiag_uid = sock_i_uid(sk);
739 r->idiag_inode = 0; 598 r->idiag_inode = 0;
740#if IS_ENABLED(CONFIG_IPV6) 599#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
741 if (r->idiag_family == AF_INET6) { 600 if (r->idiag_family == AF_INET6) {
742 struct inet_diag_entry entry; 601 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
743 inet_diag_req_addrs(sk, req, &entry); 602 &inet6_rsk(req)->loc_addr);
744 memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); 603 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
745 memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); 604 &inet6_rsk(req)->rmt_addr);
746 } 605 }
747#endif 606#endif
607 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
608
609 return skb->len;
748 610
749 return nlmsg_end(skb, nlh); 611nlmsg_failure:
612 nlmsg_trim(skb, b);
613 return -1;
750} 614}
751 615
752static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 616static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
753 struct netlink_callback *cb, 617 struct netlink_callback *cb)
754 struct inet_diag_req_v2 *r,
755 const struct nlattr *bc)
756{ 618{
757 struct inet_diag_entry entry; 619 struct inet_diag_entry entry;
620 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
758 struct inet_connection_sock *icsk = inet_csk(sk); 621 struct inet_connection_sock *icsk = inet_csk(sk);
759 struct listen_sock *lopt; 622 struct listen_sock *lopt;
623 const struct nlattr *bc = NULL;
760 struct inet_sock *inet = inet_sk(sk); 624 struct inet_sock *inet = inet_sk(sk);
761 int j, s_j; 625 int j, s_j;
762 int reqnum, s_reqnum; 626 int reqnum, s_reqnum;
@@ -776,7 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
776 if (!lopt || !lopt->qlen) 640 if (!lopt || !lopt->qlen)
777 goto out; 641 goto out;
778 642
779 if (bc != NULL) { 643 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
644 bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
645 INET_DIAG_REQ_BYTECODE);
780 entry.sport = inet->inet_num; 646 entry.sport = inet->inet_num;
781 entry.userlocks = sk->sk_userlocks; 647 entry.userlocks = sk->sk_userlocks;
782 } 648 }
@@ -795,16 +661,27 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
795 continue; 661 continue;
796 662
797 if (bc) { 663 if (bc) {
798 inet_diag_req_addrs(sk, req, &entry); 664 entry.saddr =
665#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
666 (entry.family == AF_INET6) ?
667 inet6_rsk(req)->loc_addr.s6_addr32 :
668#endif
669 &ireq->loc_addr;
670 entry.daddr =
671#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
672 (entry.family == AF_INET6) ?
673 inet6_rsk(req)->rmt_addr.s6_addr32 :
674#endif
675 &ireq->rmt_addr;
799 entry.dport = ntohs(ireq->rmt_port); 676 entry.dport = ntohs(ireq->rmt_port);
800 677
801 if (!inet_diag_bc_run(bc, &entry)) 678 if (!inet_diag_bc_run(nla_data(bc),
679 nla_len(bc), &entry))
802 continue; 680 continue;
803 } 681 }
804 682
805 err = inet_diag_fill_req(skb, sk, req, 683 err = inet_diag_fill_req(skb, sk, req,
806 sk_user_ns(NETLINK_CB(cb->skb).ssk), 684 NETLINK_CB(cb->skb).pid,
807 NETLINK_CB(cb->skb).portid,
808 cb->nlh->nlmsg_seq, cb->nlh); 685 cb->nlh->nlmsg_seq, cb->nlh);
809 if (err < 0) { 686 if (err < 0) {
810 cb->args[3] = j + 1; 687 cb->args[3] = j + 1;
@@ -822,12 +699,19 @@ out:
822 return err; 699 return err;
823} 700}
824 701
825void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, 702static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
826 struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
827{ 703{
828 int i, num; 704 int i, num;
829 int s_i, s_num; 705 int s_i, s_num;
830 struct net *net = sock_net(skb->sk); 706 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
707 const struct inet_diag_handler *handler;
708 struct inet_hashinfo *hashinfo;
709
710 handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
711 if (IS_ERR(handler))
712 goto unlock;
713
714 hashinfo = handler->idiag_hashinfo;
831 715
832 s_i = cb->args[1]; 716 s_i = cb->args[1];
833 s_num = num = cb->args[2]; 717 s_num = num = cb->args[2];
@@ -847,18 +731,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
847 sk_nulls_for_each(sk, node, &ilb->head) { 731 sk_nulls_for_each(sk, node, &ilb->head) {
848 struct inet_sock *inet = inet_sk(sk); 732 struct inet_sock *inet = inet_sk(sk);
849 733
850 if (!net_eq(sock_net(sk), net))
851 continue;
852
853 if (num < s_num) { 734 if (num < s_num) {
854 num++; 735 num++;
855 continue; 736 continue;
856 } 737 }
857 738
858 if (r->sdiag_family != AF_UNSPEC &&
859 sk->sk_family != r->sdiag_family)
860 goto next_listen;
861
862 if (r->id.idiag_sport != inet->inet_sport && 739 if (r->id.idiag_sport != inet->inet_sport &&
863 r->id.idiag_sport) 740 r->id.idiag_sport)
864 goto next_listen; 741 goto next_listen;
@@ -868,7 +745,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
868 cb->args[3] > 0) 745 cb->args[3] > 0)
869 goto syn_recv; 746 goto syn_recv;
870 747
871 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 748 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
872 spin_unlock_bh(&ilb->lock); 749 spin_unlock_bh(&ilb->lock);
873 goto done; 750 goto done;
874 } 751 }
@@ -877,7 +754,7 @@ syn_recv:
877 if (!(r->idiag_states & TCPF_SYN_RECV)) 754 if (!(r->idiag_states & TCPF_SYN_RECV))
878 goto next_listen; 755 goto next_listen;
879 756
880 if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { 757 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
881 spin_unlock_bh(&ilb->lock); 758 spin_unlock_bh(&ilb->lock);
882 goto done; 759 goto done;
883 } 760 }
@@ -899,7 +776,7 @@ skip_listen_ht:
899 } 776 }
900 777
901 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) 778 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
902 goto out; 779 goto unlock;
903 780
904 for (i = s_i; i <= hashinfo->ehash_mask; i++) { 781 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
905 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 782 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
@@ -920,22 +797,17 @@ skip_listen_ht:
920 sk_nulls_for_each(sk, node, &head->chain) { 797 sk_nulls_for_each(sk, node, &head->chain) {
921 struct inet_sock *inet = inet_sk(sk); 798 struct inet_sock *inet = inet_sk(sk);
922 799
923 if (!net_eq(sock_net(sk), net))
924 continue;
925 if (num < s_num) 800 if (num < s_num)
926 goto next_normal; 801 goto next_normal;
927 if (!(r->idiag_states & (1 << sk->sk_state))) 802 if (!(r->idiag_states & (1 << sk->sk_state)))
928 goto next_normal; 803 goto next_normal;
929 if (r->sdiag_family != AF_UNSPEC &&
930 sk->sk_family != r->sdiag_family)
931 goto next_normal;
932 if (r->id.idiag_sport != inet->inet_sport && 804 if (r->id.idiag_sport != inet->inet_sport &&
933 r->id.idiag_sport) 805 r->id.idiag_sport)
934 goto next_normal; 806 goto next_normal;
935 if (r->id.idiag_dport != inet->inet_dport && 807 if (r->id.idiag_dport != inet->inet_dport &&
936 r->id.idiag_dport) 808 r->id.idiag_dport)
937 goto next_normal; 809 goto next_normal;
938 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 810 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
939 spin_unlock_bh(lock); 811 spin_unlock_bh(lock);
940 goto done; 812 goto done;
941 } 813 }
@@ -948,21 +820,16 @@ next_normal:
948 820
949 inet_twsk_for_each(tw, node, 821 inet_twsk_for_each(tw, node,
950 &head->twchain) { 822 &head->twchain) {
951 if (!net_eq(twsk_net(tw), net))
952 continue;
953 823
954 if (num < s_num) 824 if (num < s_num)
955 goto next_dying; 825 goto next_dying;
956 if (r->sdiag_family != AF_UNSPEC &&
957 tw->tw_family != r->sdiag_family)
958 goto next_dying;
959 if (r->id.idiag_sport != tw->tw_sport && 826 if (r->id.idiag_sport != tw->tw_sport &&
960 r->id.idiag_sport) 827 r->id.idiag_sport)
961 goto next_dying; 828 goto next_dying;
962 if (r->id.idiag_dport != tw->tw_dport && 829 if (r->id.idiag_dport != tw->tw_dport &&
963 r->id.idiag_dport) 830 r->id.idiag_dport)
964 goto next_dying; 831 goto next_dying;
965 if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { 832 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
966 spin_unlock_bh(lock); 833 spin_unlock_bh(lock);
967 goto done; 834 goto done;
968 } 835 }
@@ -976,89 +843,15 @@ next_dying:
976done: 843done:
977 cb->args[1] = i; 844 cb->args[1] = i;
978 cb->args[2] = num; 845 cb->args[2] = num;
979out: 846unlock:
980 ;
981}
982EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
983
984static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
985 struct inet_diag_req_v2 *r, struct nlattr *bc)
986{
987 const struct inet_diag_handler *handler;
988 int err = 0;
989
990 handler = inet_diag_lock_handler(r->sdiag_protocol);
991 if (!IS_ERR(handler))
992 handler->dump(skb, cb, r, bc);
993 else
994 err = PTR_ERR(handler);
995 inet_diag_unlock_handler(handler); 847 inet_diag_unlock_handler(handler);
996 848 return skb->len;
997 return err ? : skb->len;
998}
999
1000static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
1001{
1002 struct nlattr *bc = NULL;
1003 int hdrlen = sizeof(struct inet_diag_req_v2);
1004
1005 if (nlmsg_attrlen(cb->nlh, hdrlen))
1006 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
1007
1008 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
1009} 849}
1010 850
1011static inline int inet_diag_type2proto(int type) 851static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1012{ 852{
1013 switch (type) {
1014 case TCPDIAG_GETSOCK:
1015 return IPPROTO_TCP;
1016 case DCCPDIAG_GETSOCK:
1017 return IPPROTO_DCCP;
1018 default:
1019 return 0;
1020 }
1021}
1022
1023static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
1024{
1025 struct inet_diag_req *rc = nlmsg_data(cb->nlh);
1026 struct inet_diag_req_v2 req;
1027 struct nlattr *bc = NULL;
1028 int hdrlen = sizeof(struct inet_diag_req); 853 int hdrlen = sizeof(struct inet_diag_req);
1029 854
1030 req.sdiag_family = AF_UNSPEC; /* compatibility */
1031 req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
1032 req.idiag_ext = rc->idiag_ext;
1033 req.idiag_states = rc->idiag_states;
1034 req.id = rc->id;
1035
1036 if (nlmsg_attrlen(cb->nlh, hdrlen))
1037 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
1038
1039 return __inet_diag_dump(skb, cb, &req, bc);
1040}
1041
1042static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
1043 const struct nlmsghdr *nlh)
1044{
1045 struct inet_diag_req *rc = nlmsg_data(nlh);
1046 struct inet_diag_req_v2 req;
1047
1048 req.sdiag_family = rc->idiag_family;
1049 req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
1050 req.idiag_ext = rc->idiag_ext;
1051 req.idiag_states = rc->idiag_states;
1052 req.id = rc->id;
1053
1054 return inet_diag_get_exact(in_skb, nlh, &req);
1055}
1056
1057static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1058{
1059 int hdrlen = sizeof(struct inet_diag_req);
1060 struct net *net = sock_net(skb->sk);
1061
1062 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || 855 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
1063 nlmsg_len(nlh) < hdrlen) 856 nlmsg_len(nlh) < hdrlen)
1064 return -EINVAL; 857 return -EINVAL;
@@ -1074,62 +867,29 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1074 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 867 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1075 return -EINVAL; 868 return -EINVAL;
1076 } 869 }
1077 {
1078 struct netlink_dump_control c = {
1079 .dump = inet_diag_dump_compat,
1080 };
1081 return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
1082 }
1083 }
1084
1085 return inet_diag_get_exact_compat(skb, nlh);
1086}
1087
1088static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
1089{
1090 int hdrlen = sizeof(struct inet_diag_req_v2);
1091 struct net *net = sock_net(skb->sk);
1092 870
1093 if (nlmsg_len(h) < hdrlen) 871 return netlink_dump_start(idiagnl, skb, nlh,
1094 return -EINVAL; 872 inet_diag_dump, NULL, 0);
1095
1096 if (h->nlmsg_flags & NLM_F_DUMP) {
1097 if (nlmsg_attrlen(h, hdrlen)) {
1098 struct nlattr *attr;
1099 attr = nlmsg_find_attr(h, hdrlen,
1100 INET_DIAG_REQ_BYTECODE);
1101 if (attr == NULL ||
1102 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
1103 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1104 return -EINVAL;
1105 }
1106 {
1107 struct netlink_dump_control c = {
1108 .dump = inet_diag_dump,
1109 };
1110 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
1111 }
1112 } 873 }
1113 874
1114 return inet_diag_get_exact(skb, h, nlmsg_data(h)); 875 return inet_diag_get_exact(skb, nlh);
1115} 876}
1116 877
1117static const struct sock_diag_handler inet_diag_handler = { 878static DEFINE_MUTEX(inet_diag_mutex);
1118 .family = AF_INET,
1119 .dump = inet_diag_handler_dump,
1120};
1121 879
1122static const struct sock_diag_handler inet6_diag_handler = { 880static void inet_diag_rcv(struct sk_buff *skb)
1123 .family = AF_INET6, 881{
1124 .dump = inet_diag_handler_dump, 882 mutex_lock(&inet_diag_mutex);
1125}; 883 netlink_rcv_skb(skb, &inet_diag_rcv_msg);
884 mutex_unlock(&inet_diag_mutex);
885}
1126 886
1127int inet_diag_register(const struct inet_diag_handler *h) 887int inet_diag_register(const struct inet_diag_handler *h)
1128{ 888{
1129 const __u16 type = h->idiag_type; 889 const __u16 type = h->idiag_type;
1130 int err = -EINVAL; 890 int err = -EINVAL;
1131 891
1132 if (type >= IPPROTO_MAX) 892 if (type >= INET_DIAG_GETSOCK_MAX)
1133 goto out; 893 goto out;
1134 894
1135 mutex_lock(&inet_diag_table_mutex); 895 mutex_lock(&inet_diag_table_mutex);
@@ -1148,7 +908,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
1148{ 908{
1149 const __u16 type = h->idiag_type; 909 const __u16 type = h->idiag_type;
1150 910
1151 if (type >= IPPROTO_MAX) 911 if (type >= INET_DIAG_GETSOCK_MAX)
1152 return; 912 return;
1153 913
1154 mutex_lock(&inet_diag_table_mutex); 914 mutex_lock(&inet_diag_table_mutex);
@@ -1159,7 +919,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister);
1159 919
1160static int __init inet_diag_init(void) 920static int __init inet_diag_init(void)
1161{ 921{
1162 const int inet_diag_table_size = (IPPROTO_MAX * 922 const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
1163 sizeof(struct inet_diag_handler *)); 923 sizeof(struct inet_diag_handler *));
1164 int err = -ENOMEM; 924 int err = -ENOMEM;
1165 925
@@ -1167,35 +927,25 @@ static int __init inet_diag_init(void)
1167 if (!inet_diag_table) 927 if (!inet_diag_table)
1168 goto out; 928 goto out;
1169 929
1170 err = sock_diag_register(&inet_diag_handler); 930 idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
1171 if (err) 931 inet_diag_rcv, NULL, THIS_MODULE);
1172 goto out_free_nl; 932 if (idiagnl == NULL)
1173 933 goto out_free_table;
1174 err = sock_diag_register(&inet6_diag_handler); 934 err = 0;
1175 if (err)
1176 goto out_free_inet;
1177
1178 sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
1179out: 935out:
1180 return err; 936 return err;
1181 937out_free_table:
1182out_free_inet:
1183 sock_diag_unregister(&inet_diag_handler);
1184out_free_nl:
1185 kfree(inet_diag_table); 938 kfree(inet_diag_table);
1186 goto out; 939 goto out;
1187} 940}
1188 941
1189static void __exit inet_diag_exit(void) 942static void __exit inet_diag_exit(void)
1190{ 943{
1191 sock_diag_unregister(&inet6_diag_handler); 944 netlink_kernel_release(idiagnl);
1192 sock_diag_unregister(&inet_diag_handler);
1193 sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
1194 kfree(inet_diag_table); 945 kfree(inet_diag_table);
1195} 946}
1196 947
1197module_init(inet_diag_init); 948module_init(inet_diag_init);
1198module_exit(inet_diag_exit); 949module_exit(inet_diag_exit);
1199MODULE_LICENSE("GPL"); 950MODULE_LICENSE("GPL");
1200MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); 951MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG);
1201MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
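The inet_diag.c hunks above capture the split between the newer sock_diag front end (NETLINK_SOCK_DIAG, struct inet_diag_req_v2, per-address-family handlers) and the older standalone NETLINK_INET_DIAG socket that takes a struct inet_diag_req keyed by TCPDIAG_GETSOCK/DCCPDIAG_GETSOCK. The following user-space C sketch issues a dump request in the legacy format that the right-hand column's inet_diag_rcv_msg()/inet_diag_dump() parse; it is illustrative only (the receive loop is simplified and inspects just the first netlink header per datagram), and on current kernels the same request is served by the compat path.

/* Minimal sketch of a legacy inet_diag TCP dump request.  Assumes the
 * sanitized kernel headers; NETLINK_INET_DIAG is the same protocol
 * number as NETLINK_SOCK_DIAG on newer headers.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/inet_diag.h>

#ifndef NETLINK_INET_DIAG
#define NETLINK_INET_DIAG 4     /* alias of NETLINK_SOCK_DIAG */
#endif

int main(void)
{
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct {
                struct nlmsghdr nlh;
                struct inet_diag_req req;       /* the v1 request parsed above */
        } msg;
        char buf[8192];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_INET_DIAG);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&msg, 0, sizeof(msg));
        msg.nlh.nlmsg_len = sizeof(msg);
        msg.nlh.nlmsg_type = TCPDIAG_GETSOCK;   /* type2proto -> IPPROTO_TCP */
        msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        msg.req.idiag_family = AF_INET;
        msg.req.idiag_states = ~0U;             /* dump every TCP state */

        if (sendto(fd, &msg, sizeof(msg), 0,
                   (struct sockaddr *)&kernel, sizeof(kernel)) < 0) {
                perror("sendto");
                return 1;
        }

        for (;;) {
                ssize_t len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *h = (struct nlmsghdr *)buf;

                /* simplified: only the first header per datagram is checked */
                if (len <= 0 || h->nlmsg_type == NLMSG_DONE ||
                    h->nlmsg_type == NLMSG_ERROR)
                        break;
                printf("got %zd bytes of inet_diag records\n", len);
        }
        close(fd);
        return 0;
}

The reply stream consists of struct inet_diag_msg records that a real tool would walk with NLMSG_NEXT(); the sketch only reports the byte counts.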
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4750d2b74d7..5ff2a51b6d0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
89 nf->low_thresh = 0; 89 nf->low_thresh = 0;
90 90
91 local_bh_disable(); 91 local_bh_disable();
92 inet_frag_evictor(nf, f, true); 92 inet_frag_evictor(nf, f);
93 local_bh_enable(); 93 local_bh_enable();
94} 94}
95EXPORT_SYMBOL(inet_frags_exit_net); 95EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,16 +158,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
158} 158}
159EXPORT_SYMBOL(inet_frag_destroy); 159EXPORT_SYMBOL(inet_frag_destroy);
160 160
161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) 161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
162{ 162{
163 struct inet_frag_queue *q; 163 struct inet_frag_queue *q;
164 int work, evicted = 0; 164 int work, evicted = 0;
165 165
166 if (!force) {
167 if (atomic_read(&nf->mem) <= nf->high_thresh)
168 return 0;
169 }
170
171 work = atomic_read(&nf->mem) - nf->low_thresh; 166 work = atomic_read(&nf->mem) - nf->low_thresh;
172 while (work > 0) { 167 while (work > 0) {
173 read_lock(&f->lock); 168 read_lock(&f->lock);
@@ -248,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
248 if (q == NULL) 243 if (q == NULL)
249 return NULL; 244 return NULL;
250 245
251 q->net = nf;
252 f->constructor(q, arg); 246 f->constructor(q, arg);
253 atomic_add(f->qsize, &nf->mem); 247 atomic_add(f->qsize, &nf->mem);
254 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 248 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
255 spin_lock_init(&q->lock); 249 spin_lock_init(&q->lock);
256 atomic_set(&q->refcnt, 1); 250 atomic_set(&q->refcnt, 1);
251 q->net = nf;
257 252
258 return q; 253 return q;
259} 254}
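The inet_fragment.c hunk removes the force parameter and the high-watermark short-circuit from inet_frag_evictor(), which is why the ip_fragment.c hunk further down reinstates the "only evict above high_thresh" check at the ip_defrag() call site. A minimal stand-alone sketch of that two-watermark pattern, with invented names (frag_mem, evict_one, maybe_evict), looks like this:

/* Toy illustration of the high/low watermark eviction pattern: nothing
 * happens until memory use crosses high_thresh, then entries are evicted
 * until usage drops back toward low_thresh.  All names are local to this
 * sketch; the thresholds match the historic ipfrag defaults.
 */
#include <stdio.h>

static long frag_mem = 300000;          /* pretend current usage, bytes */
static const long high_thresh = 262144;
static const long low_thresh  = 196608;

static long evict_one(void)
{
        frag_mem -= 4096;               /* pretend each queue frees ~4 KiB */
        return 4096;
}

static void maybe_evict(void)
{
        long work;

        if (frag_mem <= high_thresh)    /* the check ip_defrag() now does */
                return;

        work = frag_mem - low_thresh;   /* same arithmetic as inet_frag_evictor() */
        while (work > 0)
                work -= evict_one();
}

int main(void)
{
        maybe_evict();
        printf("usage after eviction: %ld bytes\n", frag_mem);
        return 0;
}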
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae814871..984ec656b03 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -217,7 +217,7 @@ begin:
217} 217}
218EXPORT_SYMBOL_GPL(__inet_lookup_listener); 218EXPORT_SYMBOL_GPL(__inet_lookup_listener);
219 219
220struct sock *__inet_lookup_established(struct net *net, 220struct sock * __inet_lookup_established(struct net *net,
221 struct inet_hashinfo *hashinfo, 221 struct inet_hashinfo *hashinfo,
222 const __be32 saddr, const __be16 sport, 222 const __be32 saddr, const __be16 sport,
223 const __be32 daddr, const u16 hnum, 223 const __be32 daddr, const u16 hnum,
@@ -237,14 +237,12 @@ struct sock *__inet_lookup_established(struct net *net,
237 rcu_read_lock(); 237 rcu_read_lock();
238begin: 238begin:
239 sk_nulls_for_each_rcu(sk, node, &head->chain) { 239 sk_nulls_for_each_rcu(sk, node, &head->chain) {
240 if (sk->sk_hash != hash) 240 if (INET_MATCH(sk, net, hash, acookie,
241 continue; 241 saddr, daddr, ports, dif)) {
242 if (likely(INET_MATCH(sk, net, acookie,
243 saddr, daddr, ports, dif))) {
244 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) 242 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
245 goto begintw; 243 goto begintw;
246 if (unlikely(!INET_MATCH(sk, net, acookie, 244 if (unlikely(!INET_MATCH(sk, net, hash, acookie,
247 saddr, daddr, ports, dif))) { 245 saddr, daddr, ports, dif))) {
248 sock_put(sk); 246 sock_put(sk);
249 goto begin; 247 goto begin;
250 } 248 }
@@ -262,18 +260,14 @@ begin:
262begintw: 260begintw:
263 /* Must check for a TIME_WAIT'er before going to listener hash. */ 261 /* Must check for a TIME_WAIT'er before going to listener hash. */
264 sk_nulls_for_each_rcu(sk, node, &head->twchain) { 262 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
265 if (sk->sk_hash != hash) 263 if (INET_TW_MATCH(sk, net, hash, acookie,
266 continue; 264 saddr, daddr, ports, dif)) {
267 if (likely(INET_TW_MATCH(sk, net, acookie,
268 saddr, daddr, ports,
269 dif))) {
270 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { 265 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
271 sk = NULL; 266 sk = NULL;
272 goto out; 267 goto out;
273 } 268 }
274 if (unlikely(!INET_TW_MATCH(sk, net, acookie, 269 if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
275 saddr, daddr, ports, 270 saddr, daddr, ports, dif))) {
276 dif))) {
277 sock_put(sk); 271 sock_put(sk);
278 goto begintw; 272 goto begintw;
279 } 273 }
@@ -320,12 +314,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
320 314
321 /* Check TIME-WAIT sockets first. */ 315 /* Check TIME-WAIT sockets first. */
322 sk_nulls_for_each(sk2, node, &head->twchain) { 316 sk_nulls_for_each(sk2, node, &head->twchain) {
323 if (sk2->sk_hash != hash) 317 tw = inet_twsk(sk2);
324 continue;
325 318
326 if (likely(INET_TW_MATCH(sk2, net, acookie, 319 if (INET_TW_MATCH(sk2, net, hash, acookie,
327 saddr, daddr, ports, dif))) { 320 saddr, daddr, ports, dif)) {
328 tw = inet_twsk(sk2);
329 if (twsk_unique(sk, sk2, twp)) 321 if (twsk_unique(sk, sk2, twp))
330 goto unique; 322 goto unique;
331 else 323 else
@@ -336,10 +328,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
336 328
337 /* And established part... */ 329 /* And established part... */
338 sk_nulls_for_each(sk2, node, &head->chain) { 330 sk_nulls_for_each(sk2, node, &head->chain) {
339 if (sk2->sk_hash != hash) 331 if (INET_MATCH(sk2, net, hash, acookie,
340 continue; 332 saddr, daddr, ports, dif))
341 if (likely(INET_MATCH(sk2, net, acookie,
342 saddr, daddr, ports, dif)))
343 goto not_unique; 333 goto not_unique;
344 } 334 }
345 335
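The inet_hashtables.c hunks change how established-hash lookups call INET_MATCH()/INET_TW_MATCH(): the newer code filters on sk->sk_hash first and then matches on an address-pair cookie, while the older macros take the hash as an extra argument. The cookie trick is that, on 64-bit hosts, both IPv4 addresses are folded into one 64-bit word and both ports into one 32-bit word, so a full 4-tuple match costs two compares. The sketch below uses local names (addrpair_t, make_addrpair, flow_match) rather than the kernel macros, and its word ordering is illustrative; the real INET_ADDR_COOKIE/INET_COMBINED_PORTS macros depend on endianness.

/* Sketch of the addrpair/portpair idea behind INET_MATCH(). */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t addrpair_t;
typedef uint32_t portpair_t;

static addrpair_t make_addrpair(uint32_t saddr_be, uint32_t daddr_be)
{
        return ((addrpair_t)saddr_be << 32) | daddr_be;
}

static portpair_t make_portpair(uint16_t sport_be, uint16_t dport_be)
{
        return ((portpair_t)sport_be << 16) | dport_be;
}

struct flow {
        addrpair_t addrs;
        portpair_t ports;
};

static int flow_match(const struct flow *f, addrpair_t acookie, portpair_t ports)
{
        return f->addrs == acookie && f->ports == ports;   /* two compares */
}

int main(void)
{
        struct flow f = {
                .addrs = make_addrpair(0x0a000001, 0x0a000002), /* 10.0.0.1 -> 10.0.0.2 */
                .ports = make_portpair(0x1234, 0x0050),
        };

        printf("match: %d\n",
               flow_match(&f, make_addrpair(0x0a000001, 0x0a000002),
                          make_portpair(0x1234, 0x0050)));
        return 0;
}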
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3f4f9..ef7ae6049a5 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -244,11 +244,11 @@ static void lro_add_frags(struct net_lro_desc *lro_desc,
244 skb->truesize += truesize; 244 skb->truesize += truesize;
245 245
246 skb_frags[0].page_offset += hlen; 246 skb_frags[0].page_offset += hlen;
247 skb_frag_size_sub(&skb_frags[0], hlen); 247 skb_frags[0].size -= hlen;
248 248
249 while (tcp_data_len > 0) { 249 while (tcp_data_len > 0) {
250 *(lro_desc->next_frag) = *skb_frags; 250 *(lro_desc->next_frag) = *skb_frags;
251 tcp_data_len -= skb_frag_size(skb_frags); 251 tcp_data_len -= skb_frags->size;
252 lro_desc->next_frag++; 252 lro_desc->next_frag++;
253 skb_frags++; 253 skb_frags++;
254 skb_shinfo(skb)->nr_frags++; 254 skb_shinfo(skb)->nr_frags++;
@@ -400,14 +400,14 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
400 skb_frags = skb_shinfo(skb)->frags; 400 skb_frags = skb_shinfo(skb)->frags;
401 while (data_len > 0) { 401 while (data_len > 0) {
402 *skb_frags = *frags; 402 *skb_frags = *frags;
403 data_len -= skb_frag_size(frags); 403 data_len -= frags->size;
404 skb_frags++; 404 skb_frags++;
405 frags++; 405 frags++;
406 skb_shinfo(skb)->nr_frags++; 406 skb_shinfo(skb)->nr_frags++;
407 } 407 }
408 408
409 skb_shinfo(skb)->frags[0].page_offset += hdr_len; 409 skb_shinfo(skb)->frags[0].page_offset += hdr_len;
410 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); 410 skb_shinfo(skb)->frags[0].size -= hdr_len;
411 411
412 skb->ip_summed = ip_summed; 412 skb->ip_summed = ip_summed;
413 skb->csum = sum; 413 skb->csum = sum;
@@ -433,7 +433,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
433 if (!lro_mgr->get_frag_header || 433 if (!lro_mgr->get_frag_header ||
434 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, 434 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
435 (void *)&tcph, &flags, priv)) { 435 (void *)&tcph, &flags, priv)) {
436 mac_hdr = skb_frag_address(frags); 436 mac_hdr = page_address(frags->page) + frags->page_offset;
437 goto out1; 437 goto out1;
438 } 438 }
439 439
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3155f..3c8dfa16614 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -11,7 +11,6 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h> 12#include <linux/kmemcheck.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h>
15#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
16#include <net/inet_timewait_sock.h> 15#include <net/inet_timewait_sock.h>
17#include <net/ip.h> 16#include <net/ip.h>
@@ -89,8 +88,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
89 88
90#ifdef SOCK_REFCNT_DEBUG 89#ifdef SOCK_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) { 90 if (atomic_read(&tw->tw_refcnt) != 1) {
92 pr_debug("%s timewait_sock %p refcnt=%d\n", 91 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 92 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 } 93 }
95#endif 94#endif
96 while (refcnt) { 95 while (refcnt) {
@@ -184,7 +183,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
184 tw->tw_daddr = inet->inet_daddr; 183 tw->tw_daddr = inet->inet_daddr;
185 tw->tw_rcv_saddr = inet->inet_rcv_saddr; 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
186 tw->tw_bound_dev_if = sk->sk_bound_dev_if; 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
187 tw->tw_tos = inet->tos;
188 tw->tw_num = inet->inet_num; 186 tw->tw_num = inet->inet_num;
189 tw->tw_state = TCP_TIME_WAIT; 187 tw->tw_state = TCP_TIME_WAIT;
190 tw->tw_substate = state; 188 tw->tw_substate = state;
@@ -263,7 +261,7 @@ rescan:
263void inet_twdr_hangman(unsigned long data) 261void inet_twdr_hangman(unsigned long data)
264{ 262{
265 struct inet_timewait_death_row *twdr; 263 struct inet_timewait_death_row *twdr;
266 unsigned int need_timer; 264 int unsigned need_timer;
267 265
268 twdr = (struct inet_timewait_death_row *)data; 266 twdr = (struct inet_timewait_death_row *)data;
269 spin_lock(&twdr->death_lock); 267 spin_lock(&twdr->death_lock);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 000e3d239d6..86f13c67ea8 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,7 +17,6 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/net.h> 19#include <linux/net.h>
20#include <linux/workqueue.h>
21#include <net/ip.h> 20#include <net/ip.h>
22#include <net/inetpeer.h> 21#include <net/inetpeer.h>
23#include <net/secure_seq.h> 22#include <net/secure_seq.h>
@@ -67,11 +66,6 @@
67 66
68static struct kmem_cache *peer_cachep __read_mostly; 67static struct kmem_cache *peer_cachep __read_mostly;
69 68
70static LIST_HEAD(gc_list);
71static const int gc_delay = 60 * HZ;
72static struct delayed_work gc_work;
73static DEFINE_SPINLOCK(gc_lock);
74
75#define node_height(x) x->avl_height 69#define node_height(x) x->avl_height
76 70
77#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 71#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -82,39 +76,23 @@ static const struct inet_peer peer_fake_node = {
82 .avl_height = 0 76 .avl_height = 0
83}; 77};
84 78
85void inet_peer_base_init(struct inet_peer_base *bp) 79struct inet_peer_base {
86{ 80 struct inet_peer __rcu *root;
87 bp->root = peer_avl_empty_rcu; 81 seqlock_t lock;
88 seqlock_init(&bp->lock); 82 int total;
89 bp->flush_seq = ~0U; 83};
90 bp->total = 0;
91}
92EXPORT_SYMBOL_GPL(inet_peer_base_init);
93
94static atomic_t v4_seq = ATOMIC_INIT(0);
95static atomic_t v6_seq = ATOMIC_INIT(0);
96
97static atomic_t *inetpeer_seq_ptr(int family)
98{
99 return (family == AF_INET ? &v4_seq : &v6_seq);
100}
101
102static inline void flush_check(struct inet_peer_base *base, int family)
103{
104 atomic_t *fp = inetpeer_seq_ptr(family);
105
106 if (unlikely(base->flush_seq != atomic_read(fp))) {
107 inetpeer_invalidate_tree(base);
108 base->flush_seq = atomic_read(fp);
109 }
110}
111 84
112void inetpeer_invalidate_family(int family) 85static struct inet_peer_base v4_peers = {
113{ 86 .root = peer_avl_empty_rcu,
114 atomic_t *fp = inetpeer_seq_ptr(family); 87 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
88 .total = 0,
89};
115 90
116 atomic_inc(fp); 91static struct inet_peer_base v6_peers = {
117} 92 .root = peer_avl_empty_rcu,
93 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
94 .total = 0,
95};
118 96
119#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 97#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
120 98
@@ -124,52 +102,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
124int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ 102int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
125int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ 103int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
126 104
127static void inetpeer_gc_worker(struct work_struct *work)
128{
129 struct inet_peer *p, *n, *c;
130 LIST_HEAD(list);
131
132 spin_lock_bh(&gc_lock);
133 list_replace_init(&gc_list, &list);
134 spin_unlock_bh(&gc_lock);
135
136 if (list_empty(&list))
137 return;
138
139 list_for_each_entry_safe(p, n, &list, gc_list) {
140
141 if (need_resched())
142 cond_resched();
143
144 c = rcu_dereference_protected(p->avl_left, 1);
145 if (c != peer_avl_empty) {
146 list_add_tail(&c->gc_list, &list);
147 p->avl_left = peer_avl_empty_rcu;
148 }
149
150 c = rcu_dereference_protected(p->avl_right, 1);
151 if (c != peer_avl_empty) {
152 list_add_tail(&c->gc_list, &list);
153 p->avl_right = peer_avl_empty_rcu;
154 }
155
156 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
157
158 if (!atomic_read(&p->refcnt)) {
159 list_del(&p->gc_list);
160 kmem_cache_free(peer_cachep, p);
161 }
162 }
163
164 if (list_empty(&list))
165 return;
166
167 spin_lock_bh(&gc_lock);
168 list_splice(&list, &gc_list);
169 spin_unlock_bh(&gc_lock);
170
171 schedule_delayed_work(&gc_work, gc_delay);
172}
173 105
174/* Called from ip_output.c:ip_init */ 106/* Called from ip_output.c:ip_init */
175void __init inet_initpeers(void) 107void __init inet_initpeers(void)
@@ -194,7 +126,6 @@ void __init inet_initpeers(void)
194 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 126 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
195 NULL); 127 NULL);
196 128
197 INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
198} 129}
199 130
200static int addr_compare(const struct inetpeer_addr *a, 131static int addr_compare(const struct inetpeer_addr *a,
@@ -205,7 +136,7 @@ static int addr_compare(const struct inetpeer_addr *a,
205 for (i = 0; i < n; i++) { 136 for (i = 0; i < n; i++) {
206 if (a->addr.a6[i] == b->addr.a6[i]) 137 if (a->addr.a6[i] == b->addr.a6[i])
207 continue; 138 continue;
208 if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) 139 if (a->addr.a6[i] < b->addr.a6[i])
209 return -1; 140 return -1;
210 return 1; 141 return 1;
211 } 142 }
@@ -419,6 +350,11 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
419 call_rcu(&p->rcu, inetpeer_free_rcu); 350 call_rcu(&p->rcu, inetpeer_free_rcu);
420} 351}
421 352
353static struct inet_peer_base *family_to_base(int family)
354{
355 return family == AF_INET ? &v4_peers : &v6_peers;
356}
357
422/* perform garbage collect on all items stacked during a lookup */ 358/* perform garbage collect on all items stacked during a lookup */
423static int inet_peer_gc(struct inet_peer_base *base, 359static int inet_peer_gc(struct inet_peer_base *base,
424 struct inet_peer __rcu **stack[PEER_MAXDEPTH], 360 struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -456,17 +392,14 @@ static int inet_peer_gc(struct inet_peer_base *base,
456 return cnt; 392 return cnt;
457} 393}
458 394
459struct inet_peer *inet_getpeer(struct inet_peer_base *base, 395struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
460 const struct inetpeer_addr *daddr,
461 int create)
462{ 396{
463 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 397 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
398 struct inet_peer_base *base = family_to_base(daddr->family);
464 struct inet_peer *p; 399 struct inet_peer *p;
465 unsigned int sequence; 400 unsigned int sequence;
466 int invalidated, gccnt = 0; 401 int invalidated, gccnt = 0;
467 402
468 flush_check(base, daddr->family);
469
470 /* Attempt a lockless lookup first. 403 /* Attempt a lockless lookup first.
471 * Because of a concurrent writer, we might not find an existing entry. 404 * Because of a concurrent writer, we might not find an existing entry.
472 */ 405 */
@@ -508,13 +441,14 @@ relookup:
508 (daddr->family == AF_INET) ? 441 (daddr->family == AF_INET) ?
509 secure_ip_id(daddr->addr.a4) : 442 secure_ip_id(daddr->addr.a4) :
510 secure_ipv6_id(daddr->addr.a6)); 443 secure_ipv6_id(daddr->addr.a6));
444 p->tcp_ts_stamp = 0;
511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 445 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
512 p->rate_tokens = 0; 446 p->rate_tokens = 0;
513 /* 60*HZ is arbitrary, but chosen enough high so that the first 447 p->rate_last = 0;
514 * calculation of tokens is at its maximum. 448 p->pmtu_expires = 0;
515 */ 449 p->pmtu_orig = 0;
516 p->rate_last = jiffies - 60*HZ; 450 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
517 INIT_LIST_HEAD(&p->gc_list); 451
518 452
519 /* Link the node. */ 453 /* Link the node. */
520 link_to_pool(p, base); 454 link_to_pool(p, base);
@@ -574,31 +508,3 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
574 return rc; 508 return rc;
575} 509}
576EXPORT_SYMBOL(inet_peer_xrlim_allow); 510EXPORT_SYMBOL(inet_peer_xrlim_allow);
577
578static void inetpeer_inval_rcu(struct rcu_head *head)
579{
580 struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
581
582 spin_lock_bh(&gc_lock);
583 list_add_tail(&p->gc_list, &gc_list);
584 spin_unlock_bh(&gc_lock);
585
586 schedule_delayed_work(&gc_work, gc_delay);
587}
588
589void inetpeer_invalidate_tree(struct inet_peer_base *base)
590{
591 struct inet_peer *root;
592
593 write_seqlock_bh(&base->lock);
594
595 root = rcu_deref_locked(base->root, base);
596 if (root != peer_avl_empty) {
597 base->root = peer_avl_empty_rcu;
598 base->total = 0;
599 call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
600 }
601
602 write_sequnlock_bh(&base->lock);
603}
604EXPORT_SYMBOL(inetpeer_invalidate_tree);
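In the inetpeer.c hunk for addr_compare(), both versions walk the key as an array of 32-bit words, one word for IPv4 and four for IPv6, and differ only in the (__force u32) casts added to keep sparse quiet about ordering __be32 values. The ordering merely has to be a consistent total order for the AVL tree, not numerically meaningful. A stand-alone rendition of the loop, assuming plain uint32_t words, is shown below as an illustration (the kernel version operates on struct inetpeer_addr):

#include <stdio.h>
#include <stdint.h>

/* Compare two address keys word by word, returning -1/0/1 like memcmp. */
static int addr_compare(const uint32_t *a, const uint32_t *b, int family_is_v4)
{
        int i, n = family_is_v4 ? 1 : 4;

        for (i = 0; i < n; i++) {
                if (a[i] == b[i])
                        continue;
                return a[i] < b[i] ? -1 : 1;
        }
        return 0;
}

int main(void)
{
        uint32_t a[4] = { 0x20010db8, 0, 0, 1 };
        uint32_t b[4] = { 0x20010db8, 0, 0, 2 };

        printf("%d\n", addr_compare(a, b, 0));   /* prints -1 */
        return 0;
}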
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 694de3b7aeb..3b34d1c8627 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -41,10 +41,9 @@
41 41
42static int ip_forward_finish(struct sk_buff *skb) 42static int ip_forward_finish(struct sk_buff *skb)
43{ 43{
44 struct ip_options *opt = &(IPCB(skb)->opt); 44 struct ip_options * opt = &(IPCB(skb)->opt);
45 45
46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
47 IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
48 47
49 if (unlikely(opt->optlen)) 48 if (unlikely(opt->optlen))
50 ip_forward_options(skb); 49 ip_forward_options(skb);
@@ -56,7 +55,7 @@ int ip_forward(struct sk_buff *skb)
56{ 55{
57 struct iphdr *iph; /* Our header */ 56 struct iphdr *iph; /* Our header */
58 struct rtable *rt; /* Route we use */ 57 struct rtable *rt; /* Route we use */
59 struct ip_options *opt = &(IPCB(skb)->opt); 58 struct ip_options * opt = &(IPCB(skb)->opt);
60 59
61 if (skb_warn_if_lro(skb)) 60 if (skb_warn_if_lro(skb))
62 goto drop; 61 goto drop;
@@ -85,7 +84,7 @@ int ip_forward(struct sk_buff *skb)
85 84
86 rt = skb_rtable(skb); 85 rt = skb_rtable(skb);
87 86
88 if (opt->is_strictroute && rt->rt_uses_gateway) 87 if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway)
89 goto sr_failed; 88 goto sr_failed;
90 89
91 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb9d63a570c..0e0ab98abc6 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,8 +20,6 @@
20 * Patrick McHardy : LRU queue of frag heads for evictor. 20 * Patrick McHardy : LRU queue of frag heads for evictor.
21 */ 21 */
22 22
23#define pr_fmt(fmt) "IPv4: " fmt
24
25#include <linux/compiler.h> 23#include <linux/compiler.h>
26#include <linux/module.h> 24#include <linux/module.h>
27#include <linux/types.h> 25#include <linux/types.h>
@@ -148,17 +146,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
148 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 146 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
149} 147}
150 148
151static bool ip4_frag_match(struct inet_frag_queue *q, void *a) 149static int ip4_frag_match(struct inet_frag_queue *q, void *a)
152{ 150{
153 struct ipq *qp; 151 struct ipq *qp;
154 struct ip4_create_arg *arg = a; 152 struct ip4_create_arg *arg = a;
155 153
156 qp = container_of(q, struct ipq, q); 154 qp = container_of(q, struct ipq, q);
157 return qp->id == arg->iph->id && 155 return qp->id == arg->iph->id &&
158 qp->saddr == arg->iph->saddr && 156 qp->saddr == arg->iph->saddr &&
159 qp->daddr == arg->iph->daddr && 157 qp->daddr == arg->iph->daddr &&
160 qp->protocol == arg->iph->protocol && 158 qp->protocol == arg->iph->protocol &&
161 qp->user == arg->user; 159 qp->user == arg->user;
162} 160}
163 161
164/* Memory Tracking Functions. */ 162/* Memory Tracking Functions. */
@@ -171,10 +169,6 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
171static void ip4_frag_init(struct inet_frag_queue *q, void *a) 169static void ip4_frag_init(struct inet_frag_queue *q, void *a)
172{ 170{
173 struct ipq *qp = container_of(q, struct ipq, q); 171 struct ipq *qp = container_of(q, struct ipq, q);
174 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
175 frags);
176 struct net *net = container_of(ipv4, struct net, ipv4);
177
178 struct ip4_create_arg *arg = a; 172 struct ip4_create_arg *arg = a;
179 173
180 qp->protocol = arg->iph->protocol; 174 qp->protocol = arg->iph->protocol;
@@ -184,7 +178,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
184 qp->daddr = arg->iph->daddr; 178 qp->daddr = arg->iph->daddr;
185 qp->user = arg->user; 179 qp->user = arg->user;
186 qp->peer = sysctl_ipfrag_max_dist ? 180 qp->peer = sysctl_ipfrag_max_dist ?
187 inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; 181 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
188} 182}
189 183
190static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 184static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -219,7 +213,7 @@ static void ip_evictor(struct net *net)
219{ 213{
220 int evicted; 214 int evicted;
221 215
222 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); 216 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
223 if (evicted) 217 if (evicted)
224 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); 218 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
225} 219}
@@ -305,7 +299,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
305 return container_of(q, struct ipq, q); 299 return container_of(q, struct ipq, q);
306 300
307out_nomem: 301out_nomem:
308 LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); 302 LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
309 return NULL; 303 return NULL;
310} 304}
311 305
@@ -398,7 +392,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
398 /* Is this the final fragment? */ 392 /* Is this the final fragment? */
399 if ((flags & IP_MF) == 0) { 393 if ((flags & IP_MF) == 0) {
400 /* If we already have some bits beyond end 394 /* If we already have some bits beyond end
401 * or have different end, the segment is corrupted. 395 * or have different end, the segment is corrupted.
402 */ 396 */
403 if (end < qp->q.len || 397 if (end < qp->q.len ||
404 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) 398 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
@@ -523,10 +517,6 @@ found:
523 if (offset == 0) 517 if (offset == 0)
524 qp->q.last_in |= INET_FRAG_FIRST_IN; 518 qp->q.last_in |= INET_FRAG_FIRST_IN;
525 519
526 if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
527 skb->len + ihl > qp->q.max_size)
528 qp->q.max_size = skb->len + ihl;
529
530 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && 520 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
531 qp->q.meat == qp->q.len) 521 qp->q.meat == qp->q.len)
532 return ip_frag_reasm(qp, prev, dev); 522 return ip_frag_reasm(qp, prev, dev);
@@ -553,7 +543,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
553 int len; 543 int len;
554 int ihlen; 544 int ihlen;
555 int err; 545 int err;
556 int sum_truesize;
557 u8 ecn; 546 u8 ecn;
558 547
559 ipq_kill(qp); 548 ipq_kill(qp);
@@ -578,7 +567,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
578 skb_morph(head, qp->q.fragments); 567 skb_morph(head, qp->q.fragments);
579 head->next = qp->q.fragments->next; 568 head->next = qp->q.fragments->next;
580 569
581 consume_skb(qp->q.fragments); 570 kfree_skb(qp->q.fragments);
582 qp->q.fragments = head; 571 qp->q.fragments = head;
583 } 572 }
584 573
@@ -610,8 +599,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
610 head->next = clone; 599 head->next = clone;
611 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 600 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
612 skb_frag_list_init(head); 601 skb_frag_list_init(head);
613 for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 602 for (i=0; i<skb_shinfo(head)->nr_frags; i++)
614 plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 603 plen += skb_shinfo(head)->frags[i].size;
615 clone->len = clone->data_len = head->data_len - plen; 604 clone->len = clone->data_len = head->data_len - plen;
616 head->data_len -= clone->len; 605 head->data_len -= clone->len;
617 head->len -= clone->len; 606 head->len -= clone->len;
@@ -620,41 +609,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
620 atomic_add(clone->truesize, &qp->q.net->mem); 609 atomic_add(clone->truesize, &qp->q.net->mem);
621 } 610 }
622 611
612 skb_shinfo(head)->frag_list = head->next;
623 skb_push(head, head->data - skb_network_header(head)); 613 skb_push(head, head->data - skb_network_header(head));
624 614
625 sum_truesize = head->truesize; 615 for (fp=head->next; fp; fp = fp->next) {
626 for (fp = head->next; fp;) { 616 head->data_len += fp->len;
627 bool headstolen; 617 head->len += fp->len;
628 int delta;
629 struct sk_buff *next = fp->next;
630
631 sum_truesize += fp->truesize;
632 if (head->ip_summed != fp->ip_summed) 618 if (head->ip_summed != fp->ip_summed)
633 head->ip_summed = CHECKSUM_NONE; 619 head->ip_summed = CHECKSUM_NONE;
634 else if (head->ip_summed == CHECKSUM_COMPLETE) 620 else if (head->ip_summed == CHECKSUM_COMPLETE)
635 head->csum = csum_add(head->csum, fp->csum); 621 head->csum = csum_add(head->csum, fp->csum);
636 622 head->truesize += fp->truesize;
637 if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
638 kfree_skb_partial(fp, headstolen);
639 } else {
640 if (!skb_shinfo(head)->frag_list)
641 skb_shinfo(head)->frag_list = fp;
642 head->data_len += fp->len;
643 head->len += fp->len;
644 head->truesize += fp->truesize;
645 }
646 fp = next;
647 } 623 }
648 atomic_sub(sum_truesize, &qp->q.net->mem); 624 atomic_sub(head->truesize, &qp->q.net->mem);
649 625
650 head->next = NULL; 626 head->next = NULL;
651 head->dev = dev; 627 head->dev = dev;
652 head->tstamp = qp->q.stamp; 628 head->tstamp = qp->q.stamp;
653 IPCB(head)->frag_max_size = qp->q.max_size;
654 629
655 iph = ip_hdr(head); 630 iph = ip_hdr(head);
656 /* max_size != 0 implies at least one fragment had IP_DF set */ 631 iph->frag_off = 0;
657 iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
658 iph->tot_len = htons(len); 632 iph->tot_len = htons(len);
659 iph->tos |= ecn; 633 iph->tos |= ecn;
660 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 634 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -663,12 +637,14 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
663 return 0; 637 return 0;
664 638
665out_nomem: 639out_nomem:
666 LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), 640 LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
667 qp); 641 "queue %p\n", qp);
668 err = -ENOMEM; 642 err = -ENOMEM;
669 goto out_fail; 643 goto out_fail;
670out_oversize: 644out_oversize:
671 net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); 645 if (net_ratelimit())
646 printk(KERN_INFO "Oversized IP packet from %pI4.\n",
647 &qp->saddr);
672out_fail: 648out_fail:
673 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); 649 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
674 return err; 650 return err;
@@ -684,7 +660,8 @@ int ip_defrag(struct sk_buff *skb, u32 user)
684 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 660 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
685 661
686 /* Start by cleaning up the memory. */ 662 /* Start by cleaning up the memory. */
687 ip_evictor(net); 663 if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
664 ip_evictor(net);
688 665
689 /* Lookup (or create) queue header */ 666 /* Lookup (or create) queue header */
690 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 667 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
@@ -705,41 +682,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
705} 682}
706EXPORT_SYMBOL(ip_defrag); 683EXPORT_SYMBOL(ip_defrag);
707 684
708struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
709{
710 struct iphdr iph;
711 u32 len;
712
713 if (skb->protocol != htons(ETH_P_IP))
714 return skb;
715
716 if (!skb_copy_bits(skb, 0, &iph, sizeof(iph)))
717 return skb;
718
719 if (iph.ihl < 5 || iph.version != 4)
720 return skb;
721
722 len = ntohs(iph.tot_len);
723 if (skb->len < len || len < (iph.ihl * 4))
724 return skb;
725
726 if (ip_is_fragment(&iph)) {
727 skb = skb_share_check(skb, GFP_ATOMIC);
728 if (skb) {
729 if (!pskb_may_pull(skb, iph.ihl*4))
730 return skb;
731 if (pskb_trim_rcsum(skb, len))
732 return skb;
733 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
734 if (ip_defrag(skb, user))
735 return NULL;
736 skb->rxhash = 0;
737 }
738 }
739 return skb;
740}
741EXPORT_SYMBOL(ip_check_defrag);
742
743#ifdef CONFIG_SYSCTL 685#ifdef CONFIG_SYSCTL
744static int zero; 686static int zero;
745 687
@@ -801,13 +743,9 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
801 table[0].data = &net->ipv4.frags.high_thresh; 743 table[0].data = &net->ipv4.frags.high_thresh;
802 table[1].data = &net->ipv4.frags.low_thresh; 744 table[1].data = &net->ipv4.frags.low_thresh;
803 table[2].data = &net->ipv4.frags.timeout; 745 table[2].data = &net->ipv4.frags.timeout;
804
805 /* Don't export sysctls to unprivileged users */
806 if (net->user_ns != &init_user_ns)
807 table[0].procname = NULL;
808 } 746 }
809 747
810 hdr = register_net_sysctl(net, "net/ipv4", table); 748 hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
811 if (hdr == NULL) 749 if (hdr == NULL)
812 goto err_reg; 750 goto err_reg;
813 751
@@ -832,7 +770,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
832 770
833static void ip4_frags_ctl_register(void) 771static void ip4_frags_ctl_register(void)
834{ 772{
835 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); 773 register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
836} 774}
837#else 775#else
838static inline int ip4_frags_ns_ctl_register(struct net *net) 776static inline int ip4_frags_ns_ctl_register(struct net *net)
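Several of the ip_fragment.c hunks turn on the frag_off fields of the IPv4 header: the final-fragment test is (flags & IP_MF) == 0, and the newer code also propagates IP_DF from the largest fragment via qp->q.max_size. As a reminder of how those bits and the 13-bit offset (in 8-byte units) are laid out, here is a small user-space decoder; decode_frag_off() is a local helper for this sketch and takes frag_off already converted to host byte order:

#include <stdio.h>
#include <stdint.h>

#define IP_DF     0x4000        /* don't fragment */
#define IP_MF     0x2000        /* more fragments follow */
#define IP_OFFSET 0x1FFF        /* fragment offset, in 8-byte units */

static void decode_frag_off(uint16_t frag_off_host)
{
        unsigned int offset = (frag_off_host & IP_OFFSET) * 8;  /* bytes */
        int more = !!(frag_off_host & IP_MF);
        int dont_frag = !!(frag_off_host & IP_DF);

        printf("offset=%u bytes, MF=%d, DF=%d, last_fragment=%d\n",
               offset, more, dont_frag, !more);
}

int main(void)
{
        decode_frag_off(0x2000);        /* first fragment, MF set        */
        decode_frag_off(0x00b9);        /* offset 185*8 = 1480, last one */
        return 0;
}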
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 303012adf9e..d7bb94c4834 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -10,8 +10,6 @@
10 * 10 *
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/capability.h> 13#include <linux/capability.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/types.h> 15#include <linux/types.h>
@@ -48,7 +46,7 @@
48#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
49#include <net/gre.h> 47#include <net/gre.h>
50 48
51#if IS_ENABLED(CONFIG_IPV6) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
52#include <net/ipv6.h> 50#include <net/ipv6.h>
53#include <net/ip6_fib.h> 51#include <net/ip6_fib.h>
54#include <net/ip6_route.h> 52#include <net/ip6_route.h>
@@ -67,7 +65,7 @@
67 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
68 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
69 and silently drop packet when it expires. It is a good 67 and silently drop packet when it expires. It is a good
70 solution, but it supposes maintaining new variable in ALL 68 solution, but it supposes maintaining new variable in ALL
71 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
72 70
73 Current solution: xmit_recursion breaks dead loops. This is a percpu 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
@@ -93,14 +91,14 @@
93 91
94 One of them is to parse packet trying to detect inner encapsulation 92 One of them is to parse packet trying to detect inner encapsulation
95 made by our node. It is difficult or even impossible, especially, 93 made by our node. It is difficult or even impossible, especially,
96 taking into account fragmentation. TO be short, ttl is not solution at all. 94 taking into account fragmentation. TO be short, ttl is not solution at all.
97 95
98 Current solution: The solution was UNEXPECTEDLY SIMPLE. 96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
99 We force DF flag on tunnels with preconfigured hop limit, 97 We force DF flag on tunnels with preconfigured hop limit,
100 that is ALL. :-) Well, it does not remove the problem completely, 98 that is ALL. :-) Well, it does not remove the problem completely,
101 but exponential growth of network traffic is changed to linear 99 but exponential growth of network traffic is changed to linear
102 (branches, that exceed pmtu are pruned) and tunnel mtu 100 (branches, that exceed pmtu are pruned) and tunnel mtu
103 rapidly degrades to value <68, where looping stops. 101 rapidly degrades to value <68, where looping stops.
104 Yes, it is not good if there exists a router in the loop, 102 Yes, it is not good if there exists a router in the loop,
105 which does not force DF, even when encapsulating packets have DF set. 103 which does not force DF, even when encapsulating packets have DF set.
106 But it is not our problem! Nobody could accuse us, we made 104 But it is not our problem! Nobody could accuse us, we made
@@ -120,10 +118,6 @@
120 Alexey Kuznetsov. 118 Alexey Kuznetsov.
121 */ 119 */
122 120
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127static struct rtnl_link_ops ipgre_link_ops __read_mostly; 121static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128static int ipgre_tunnel_init(struct net_device *dev); 122static int ipgre_tunnel_init(struct net_device *dev);
129static void ipgre_tunnel_setup(struct net_device *dev); 123static void ipgre_tunnel_setup(struct net_device *dev);
@@ -164,66 +158,46 @@ struct ipgre_net {
164#define tunnels_r tunnels[2] 158#define tunnels_r tunnels[2]
165#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
166#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
161/*
162 * Locking : hash tables are protected by RCU and RTNL
163 */
164
165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, 168/* often modified stats are per cpu, other are shared (netdev->stats) */
169 struct rtnl_link_stats64 *tot) 169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
170{ 177{
178 struct pcpu_tstats sum = { 0 };
171 int i; 179 int i;
172 180
173 for_each_possible_cpu(i) { 181 for_each_possible_cpu(i) {
174 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176 unsigned int start;
177
178 do {
179 start = u64_stats_fetch_begin_bh(&tstats->syncp);
180 rx_packets = tstats->rx_packets;
181 tx_packets = tstats->tx_packets;
182 rx_bytes = tstats->rx_bytes;
183 tx_bytes = tstats->tx_bytes;
184 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185
186 tot->rx_packets += rx_packets;
187 tot->tx_packets += tx_packets;
188 tot->rx_bytes += rx_bytes;
189 tot->tx_bytes += tx_bytes;
190 }
191
192 tot->multicast = dev->stats.multicast;
193 tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 tot->rx_length_errors = dev->stats.rx_length_errors;
196 tot->rx_frame_errors = dev->stats.rx_frame_errors;
197 tot->rx_errors = dev->stats.rx_errors;
198
199 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201 tot->tx_dropped = dev->stats.tx_dropped;
202 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203 tot->tx_errors = dev->stats.tx_errors;
204 183
205 return tot; 184 sum.rx_packets += tstats->rx_packets;
206} 185 sum.rx_bytes += tstats->rx_bytes;
207 186 sum.tx_packets += tstats->tx_packets;
208/* Does key in tunnel parameters match packet */ 187 sum.tx_bytes += tstats->tx_bytes;
209static bool ipgre_key_match(const struct ip_tunnel_parm *p, 188 }
210 __be16 flags, __be32 key) 189 dev->stats.rx_packets = sum.rx_packets;
211{ 190 dev->stats.rx_bytes = sum.rx_bytes;
212 if (p->i_flags & GRE_KEY) { 191 dev->stats.tx_packets = sum.tx_packets;
213 if (flags & GRE_KEY) 192 dev->stats.tx_bytes = sum.tx_bytes;
214 return key == p->i_key; 193 return &dev->stats;
215 else
216 return false; /* key expected, none present */
217 } else
218 return !(flags & GRE_KEY);
219} 194}
220 195
221/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
222 197
223static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
224 __be32 remote, __be32 local, 199 __be32 remote, __be32 local,
225 __be16 flags, __be32 key, 200 __be32 key, __be16 gre_proto)
226 __be16 gre_proto)
227{ 201{
228 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
229 int link = dev->ifindex; 203 int link = dev->ifindex;
@@ -235,15 +209,13 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
235 ARPHRD_ETHER : ARPHRD_IPGRE; 209 ARPHRD_ETHER : ARPHRD_IPGRE;
236 int score, cand_score = 4; 210 int score, cand_score = 4;
237 211
238 for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { 212 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
239 if (local != t->parms.iph.saddr || 213 if (local != t->parms.iph.saddr ||
240 remote != t->parms.iph.daddr || 214 remote != t->parms.iph.daddr ||
215 key != t->parms.i_key ||
241 !(t->dev->flags & IFF_UP)) 216 !(t->dev->flags & IFF_UP))
242 continue; 217 continue;
243 218
244 if (!ipgre_key_match(&t->parms, flags, key))
245 continue;
246
247 if (t->dev->type != ARPHRD_IPGRE && 219 if (t->dev->type != ARPHRD_IPGRE &&
248 t->dev->type != dev_type) 220 t->dev->type != dev_type)
249 continue; 221 continue;
@@ -262,14 +234,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
262 } 234 }
263 } 235 }
264 236
265 for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { 237 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
266 if (remote != t->parms.iph.daddr || 238 if (remote != t->parms.iph.daddr ||
239 key != t->parms.i_key ||
267 !(t->dev->flags & IFF_UP)) 240 !(t->dev->flags & IFF_UP))
268 continue; 241 continue;
269 242
270 if (!ipgre_key_match(&t->parms, flags, key))
271 continue;
272
273 if (t->dev->type != ARPHRD_IPGRE && 243 if (t->dev->type != ARPHRD_IPGRE &&
274 t->dev->type != dev_type) 244 t->dev->type != dev_type)
275 continue; 245 continue;
@@ -288,16 +258,14 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
288 } 258 }
289 } 259 }
290 260
291 for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { 261 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
292 if ((local != t->parms.iph.saddr && 262 if ((local != t->parms.iph.saddr &&
293 (local != t->parms.iph.daddr || 263 (local != t->parms.iph.daddr ||
294 !ipv4_is_multicast(local))) || 264 !ipv4_is_multicast(local))) ||
265 key != t->parms.i_key ||
295 !(t->dev->flags & IFF_UP)) 266 !(t->dev->flags & IFF_UP))
296 continue; 267 continue;
297 268
298 if (!ipgre_key_match(&t->parms, flags, key))
299 continue;
300
301 if (t->dev->type != ARPHRD_IPGRE && 269 if (t->dev->type != ARPHRD_IPGRE &&
302 t->dev->type != dev_type) 270 t->dev->type != dev_type)
303 continue; 271 continue;
@@ -316,7 +284,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
316 } 284 }
317 } 285 }
318 286
319 for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { 287 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
320 if (t->parms.i_key != key || 288 if (t->parms.i_key != key ||
321 !(t->dev->flags & IFF_UP)) 289 !(t->dev->flags & IFF_UP))
322 continue; 290 continue;
@@ -454,10 +422,6 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
454 if (register_netdevice(dev) < 0) 422 if (register_netdevice(dev) < 0)
455 goto failed_free; 423 goto failed_free;
456 424
457 /* Can use a lockless transmit, unless we generate output sequences */
458 if (!(nt->parms.o_flags & GRE_SEQ))
459 dev->features |= NETIF_F_LLTX;
460
461 dev_hold(dev); 425 dev_hold(dev);
462 ipgre_tunnel_link(ign, nt); 426 ipgre_tunnel_link(ign, nt);
463 return nt; 427 return nt;
@@ -489,18 +453,17 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
489 GRE tunnels with enabled checksum. Tell them "thank you". 453 GRE tunnels with enabled checksum. Tell them "thank you".
490 454
491 Well, I wonder, rfc1812 was written by Cisco employee, 455 Well, I wonder, rfc1812 was written by Cisco employee,
492 what the hell these idiots break standards established 456 what the hell these idiots break standards established
493 by themselves??? 457 by themselves???
494 */ 458 */
495 459
496 const struct iphdr *iph = (const struct iphdr *)skb->data; 460 const struct iphdr *iph = (const struct iphdr *)skb->data;
497 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); 461 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
498 int grehlen = (iph->ihl<<2) + 4; 462 int grehlen = (iph->ihl<<2) + 4;
499 const int type = icmp_hdr(skb)->type; 463 const int type = icmp_hdr(skb)->type;
500 const int code = icmp_hdr(skb)->code; 464 const int code = icmp_hdr(skb)->code;
501 struct ip_tunnel *t; 465 struct ip_tunnel *t;
502 __be16 flags; 466 __be16 flags;
503 __be32 key = 0;
504 467
505 flags = p[0]; 468 flags = p[0];
506 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 469 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -517,9 +480,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
517 if (skb_headlen(skb) < grehlen) 480 if (skb_headlen(skb) < grehlen)
518 return; 481 return;
519 482
520 if (flags & GRE_KEY)
521 key = *(((__be32 *)p) + (grehlen / 4) - 1);
522
523 switch (type) { 483 switch (type) {
524 default: 484 default:
525 case ICMP_PARAMETERPROB: 485 case ICMP_PARAMETERPROB:
@@ -531,6 +491,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
531 case ICMP_PORT_UNREACH: 491 case ICMP_PORT_UNREACH:
532 /* Impossible event. */ 492 /* Impossible event. */
533 return; 493 return;
494 case ICMP_FRAG_NEEDED:
495 /* Soft state for pmtu is maintained by IP core. */
496 return;
534 default: 497 default:
535 /* All others are translated to HOST_UNREACH. 498 /* All others are translated to HOST_UNREACH.
536 rfc2003 contains "deep thoughts" about NET_UNREACH, 499 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -543,39 +506,38 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
543 if (code != ICMP_EXC_TTL) 506 if (code != ICMP_EXC_TTL)
544 return; 507 return;
545 break; 508 break;
546
547 case ICMP_REDIRECT:
548 break;
549 } 509 }
550 510
511 rcu_read_lock();
551 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 512 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
552 flags, key, p[1]); 513 flags & GRE_KEY ?
553 514 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
554 if (t == NULL) 515 p[1]);
555 return; 516 if (t == NULL || t->parms.iph.daddr == 0 ||
556
557 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
558 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
559 t->parms.link, 0, IPPROTO_GRE, 0);
560 return;
561 }
562 if (type == ICMP_REDIRECT) {
563 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
564 IPPROTO_GRE, 0);
565 return;
566 }
567 if (t->parms.iph.daddr == 0 ||
568 ipv4_is_multicast(t->parms.iph.daddr)) 517 ipv4_is_multicast(t->parms.iph.daddr))
569 return; 518 goto out;
570 519
571 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 520 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
572 return; 521 goto out;
573 522
574 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 523 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
575 t->err_count++; 524 t->err_count++;
576 else 525 else
577 t->err_count = 1; 526 t->err_count = 1;
578 t->err_time = jiffies; 527 t->err_time = jiffies;
528out:
529 rcu_read_unlock();
530}
531
532static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
533{
534 if (INET_ECN_is_ce(iph->tos)) {
535 if (skb->protocol == htons(ETH_P_IP)) {
536 IP_ECN_set_ce(ip_hdr(skb));
537 } else if (skb->protocol == htons(ETH_P_IPV6)) {
538 IP6_ECN_set_ce(ipv6_hdr(skb));
539 }
540 }
579} 541}
580 542
581static inline u8 543static inline u8
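The hunk above restores the small ipgre_ecn_decapsulate() helper: it copies a Congestion Experienced mark from the outer tunnel header onto the inner packet. The ECN field is the low two bits of the TOS byte (RFC 3168) and CE is the value 3. A minimal userspace sketch of the same check, with no kernel types; the kernel helpers additionally skip not-ECT inner packets and adjust the IPv4 header checksum:

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03   /* low two bits of the TOS / traffic-class byte */
#define ECN_CE   0x03   /* Congestion Experienced */

/* Roughly what INET_ECN_is_ce() tests on the outer header. */
static int outer_is_ce(uint8_t outer_tos)
{
        return (outer_tos & ECN_MASK) == ECN_CE;
}

/* Roughly what IP_ECN_set_ce() does to the inner TOS byte. */
static uint8_t inner_set_ce(uint8_t inner_tos)
{
        return inner_tos | ECN_CE;
}

int main(void)
{
        uint8_t outer = 0x03, inner = 0x02;   /* inner is ECT(0) */
        if (outer_is_ce(outer))
                inner = inner_set_ce(inner);
        printf("inner tos now %#x\n", inner);
        return 0;
}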
@@ -600,21 +562,20 @@ static int ipgre_rcv(struct sk_buff *skb)
600 struct ip_tunnel *tunnel; 562 struct ip_tunnel *tunnel;
601 int offset = 4; 563 int offset = 4;
602 __be16 gre_proto; 564 __be16 gre_proto;
603 int err;
604 565
605 if (!pskb_may_pull(skb, 16)) 566 if (!pskb_may_pull(skb, 16))
606 goto drop; 567 goto drop_nolock;
607 568
608 iph = ip_hdr(skb); 569 iph = ip_hdr(skb);
609 h = skb->data; 570 h = skb->data;
610 flags = *(__be16 *)h; 571 flags = *(__be16*)h;
611 572
612 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { 573 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
613 /* - Version must be 0. 574 /* - Version must be 0.
614 - We do not support routing headers. 575 - We do not support routing headers.
615 */ 576 */
616 if (flags&(GRE_VERSION|GRE_ROUTING)) 577 if (flags&(GRE_VERSION|GRE_ROUTING))
617 goto drop; 578 goto drop_nolock;
618 579
619 if (flags&GRE_CSUM) { 580 if (flags&GRE_CSUM) {
620 switch (skb->ip_summed) { 581 switch (skb->ip_summed) {
@@ -631,21 +592,21 @@ static int ipgre_rcv(struct sk_buff *skb)
631 offset += 4; 592 offset += 4;
632 } 593 }
633 if (flags&GRE_KEY) { 594 if (flags&GRE_KEY) {
634 key = *(__be32 *)(h + offset); 595 key = *(__be32*)(h + offset);
635 offset += 4; 596 offset += 4;
636 } 597 }
637 if (flags&GRE_SEQ) { 598 if (flags&GRE_SEQ) {
638 seqno = ntohl(*(__be32 *)(h + offset)); 599 seqno = ntohl(*(__be32*)(h + offset));
639 offset += 4; 600 offset += 4;
640 } 601 }
641 } 602 }
642 603
643 gre_proto = *(__be16 *)(h + 2); 604 gre_proto = *(__be16 *)(h + 2);
644 605
645 tunnel = ipgre_tunnel_lookup(skb->dev, 606 rcu_read_lock();
646 iph->saddr, iph->daddr, flags, key, 607 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
647 gre_proto); 608 iph->saddr, iph->daddr, key,
648 if (tunnel) { 609 gre_proto))) {
649 struct pcpu_tstats *tstats; 610 struct pcpu_tstats *tstats;
650 611
651 secpath_reset(skb); 612 secpath_reset(skb);
@@ -704,33 +665,25 @@ static int ipgre_rcv(struct sk_buff *skb)
704 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 665 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
705 } 666 }
706 667
668 tstats = this_cpu_ptr(tunnel->dev->tstats);
669 tstats->rx_packets++;
670 tstats->rx_bytes += skb->len;
671
707 __skb_tunnel_rx(skb, tunnel->dev); 672 __skb_tunnel_rx(skb, tunnel->dev);
708 673
709 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
710 err = IP_ECN_decapsulate(iph, skb); 675 ipgre_ecn_decapsulate(iph, skb);
711 if (unlikely(err)) {
712 if (log_ecn_error)
713 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
714 &iph->saddr, iph->tos);
715 if (err > 1) {
716 ++tunnel->dev->stats.rx_frame_errors;
717 ++tunnel->dev->stats.rx_errors;
718 goto drop;
719 }
720 }
721 676
722 tstats = this_cpu_ptr(tunnel->dev->tstats); 677 netif_rx(skb);
723 u64_stats_update_begin(&tstats->syncp);
724 tstats->rx_packets++;
725 tstats->rx_bytes += skb->len;
726 u64_stats_update_end(&tstats->syncp);
727 678
728 gro_cells_receive(&tunnel->gro_cells, skb); 679 rcu_read_unlock();
729 return 0; 680 return 0;
730 } 681 }
731 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 682 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
732 683
733drop: 684drop:
685 rcu_read_unlock();
686drop_nolock:
734 kfree_skb(skb); 687 kfree_skb(skb);
735 return 0; 688 return 0;
736} 689}
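ipgre_rcv() above walks the optional GRE fields in a fixed order: a 4 byte base header, then 4 bytes each for checksum, key and sequence number when the matching flag bit is set (RFC 2890), which is exactly why `offset` grows in steps of 4. A small standalone sketch of that offset arithmetic; the masks are written in host order here, whereas the kernel constants are network-order __be16 values:

#include <stdint.h>
#include <stdio.h>

#define GRE_CSUM 0x8000
#define GRE_KEY  0x2000
#define GRE_SEQ  0x1000

/* Total GRE header length implied by the flag word. */
static int gre_header_len(uint16_t flags)
{
        int len = 4;                    /* flags + protocol type */
        if (flags & GRE_CSUM)
                len += 4;               /* checksum + reserved */
        if (flags & GRE_KEY)
                len += 4;
        if (flags & GRE_SEQ)
                len += 4;
        return len;
}

int main(void)
{
        printf("%d\n", gre_header_len(GRE_KEY | GRE_SEQ));   /* prints 12 */
        return 0;
}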
@@ -738,6 +691,7 @@ drop:
738static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 691static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
739{ 692{
740 struct ip_tunnel *tunnel = netdev_priv(dev); 693 struct ip_tunnel *tunnel = netdev_priv(dev);
694 struct pcpu_tstats *tstats;
741 const struct iphdr *old_iph = ip_hdr(skb); 695 const struct iphdr *old_iph = ip_hdr(skb);
742 const struct iphdr *tiph; 696 const struct iphdr *tiph;
743 struct flowi4 fl4; 697 struct flowi4 fl4;
@@ -750,21 +704,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
750 int gre_hlen; 704 int gre_hlen;
751 __be32 dst; 705 __be32 dst;
752 int mtu; 706 int mtu;
753 u8 ttl;
754
755 if (skb->ip_summed == CHECKSUM_PARTIAL &&
756 skb_checksum_help(skb))
757 goto tx_error;
758 707
759 if (dev->type == ARPHRD_ETHER) 708 if (dev->type == ARPHRD_ETHER)
760 IPCB(skb)->flags = 0; 709 IPCB(skb)->flags = 0;
761 710
762 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 711 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
763 gre_hlen = 0; 712 gre_hlen = 0;
764 if (skb->protocol == htons(ETH_P_IP)) 713 tiph = (const struct iphdr *)skb->data;
765 tiph = (const struct iphdr *)skb->data;
766 else
767 tiph = &tunnel->parms.iph;
768 } else { 714 } else {
769 gre_hlen = tunnel->hlen; 715 gre_hlen = tunnel->hlen;
770 tiph = &tunnel->parms.iph; 716 tiph = &tunnel->parms.iph;
@@ -780,16 +726,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
780 726
781 if (skb->protocol == htons(ETH_P_IP)) { 727 if (skb->protocol == htons(ETH_P_IP)) {
782 rt = skb_rtable(skb); 728 rt = skb_rtable(skb);
783 dst = rt_nexthop(rt, old_iph->daddr); 729 if ((dst = rt->rt_gateway) == 0)
730 goto tx_error_icmp;
784 } 731 }
785#if IS_ENABLED(CONFIG_IPV6) 732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
786 else if (skb->protocol == htons(ETH_P_IPV6)) { 733 else if (skb->protocol == htons(ETH_P_IPV6)) {
734 struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
787 const struct in6_addr *addr6; 735 const struct in6_addr *addr6;
788 struct neighbour *neigh;
789 bool do_tx_error_icmp;
790 int addr_type; 736 int addr_type;
791 737
792 neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
793 if (neigh == NULL) 738 if (neigh == NULL)
794 goto tx_error; 739 goto tx_error;
795 740
@@ -802,21 +747,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
802 } 747 }
803 748
804 if ((addr_type & IPV6_ADDR_COMPATv4) == 0) 749 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
805 do_tx_error_icmp = true;
806 else {
807 do_tx_error_icmp = false;
808 dst = addr6->s6_addr32[3];
809 }
810 neigh_release(neigh);
811 if (do_tx_error_icmp)
812 goto tx_error_icmp; 750 goto tx_error_icmp;
751
752 dst = addr6->s6_addr32[3];
813 } 753 }
814#endif 754#endif
815 else 755 else
816 goto tx_error; 756 goto tx_error;
817 } 757 }
818 758
819 ttl = tiph->ttl;
820 tos = tiph->tos; 759 tos = tiph->tos;
821 if (tos == 1) { 760 if (tos == 1) {
822 tos = 0; 761 tos = 0;
@@ -848,7 +787,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
848 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 787 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
849 788
850 if (skb_dst(skb)) 789 if (skb_dst(skb))
851 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 790 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
852 791
853 if (skb->protocol == htons(ETH_P_IP)) { 792 if (skb->protocol == htons(ETH_P_IP)) {
854 df |= (old_iph->frag_off&htons(IP_DF)); 793 df |= (old_iph->frag_off&htons(IP_DF));
@@ -860,7 +799,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
860 goto tx_error; 799 goto tx_error;
861 } 800 }
862 } 801 }
863#if IS_ENABLED(CONFIG_IPV6) 802#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
864 else if (skb->protocol == htons(ETH_P_IPV6)) { 803 else if (skb->protocol == htons(ETH_P_IPV6)) {
865 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 804 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
866 805
@@ -909,12 +848,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
909 dev_kfree_skb(skb); 848 dev_kfree_skb(skb);
910 skb = new_skb; 849 skb = new_skb;
911 old_iph = ip_hdr(skb); 850 old_iph = ip_hdr(skb);
912 /* Warning : tiph value might point to freed memory */
913 } 851 }
914 852
853 skb_reset_transport_header(skb);
915 skb_push(skb, gre_hlen); 854 skb_push(skb, gre_hlen);
916 skb_reset_network_header(skb); 855 skb_reset_network_header(skb);
917 skb_set_transport_header(skb, sizeof(*iph));
918 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 856 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
919 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 857 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
920 IPSKB_REROUTED); 858 IPSKB_REROUTED);
@@ -933,12 +871,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
933 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 871 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
934 iph->daddr = fl4.daddr; 872 iph->daddr = fl4.daddr;
935 iph->saddr = fl4.saddr; 873 iph->saddr = fl4.saddr;
936 iph->ttl = ttl;
937 874
938 if (ttl == 0) { 875 if ((iph->ttl = tiph->ttl) == 0) {
939 if (skb->protocol == htons(ETH_P_IP)) 876 if (skb->protocol == htons(ETH_P_IP))
940 iph->ttl = old_iph->ttl; 877 iph->ttl = old_iph->ttl;
941#if IS_ENABLED(CONFIG_IPV6) 878#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
942 else if (skb->protocol == htons(ETH_P_IPV6)) 879 else if (skb->protocol == htons(ETH_P_IPV6))
943 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; 880 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
944#endif 881#endif
@@ -951,7 +888,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
951 htons(ETH_P_TEB) : skb->protocol; 888 htons(ETH_P_TEB) : skb->protocol;
952 889
953 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { 890 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
954 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); 891 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
955 892
956 if (tunnel->parms.o_flags&GRE_SEQ) { 893 if (tunnel->parms.o_flags&GRE_SEQ) {
957 ++tunnel->o_seqno; 894 ++tunnel->o_seqno;
@@ -964,17 +901,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
964 } 901 }
965 if (tunnel->parms.o_flags&GRE_CSUM) { 902 if (tunnel->parms.o_flags&GRE_CSUM) {
966 *ptr = 0; 903 *ptr = 0;
967 *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr)); 904 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
968 } 905 }
969 } 906 }
970 907
971 iptunnel_xmit(skb, dev); 908 nf_reset(skb);
909 tstats = this_cpu_ptr(dev->tstats);
910 __IPTUNNEL_XMIT(tstats, &dev->stats);
972 return NETDEV_TX_OK; 911 return NETDEV_TX_OK;
973 912
974#if IS_ENABLED(CONFIG_IPV6)
975tx_error_icmp: 913tx_error_icmp:
976 dst_link_failure(skb); 914 dst_link_failure(skb);
977#endif 915
978tx_error: 916tx_error:
979 dev->stats.tx_errors++; 917 dev->stats.tx_errors++;
980 dev_kfree_skb(skb); 918 dev_kfree_skb(skb);
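When GRE_CSUM is set, the transmit path above writes the standard Internet checksum over the GRE header and payload (ip_compute_csum() in the kernel). The arithmetic is the 16 bit one's complement sum of RFC 1071; a self-contained sketch follows. Byte-order handling differs slightly from the in-kernel helper, which sums 16-bit loads in host order, but the folding and complement are the same:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* One's-complement checksum over a buffer (RFC 1071). */
static uint16_t inet_csum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)               /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        uint8_t data[] = { 0x45, 0x00, 0x00, 0x1c };
        printf("%#06x\n", inet_csum(data, sizeof(data)));
        return 0;
}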
@@ -1071,7 +1009,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1071 case SIOCADDTUNNEL: 1009 case SIOCADDTUNNEL:
1072 case SIOCCHGTUNNEL: 1010 case SIOCCHGTUNNEL:
1073 err = -EPERM; 1011 err = -EPERM;
1074 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1012 if (!capable(CAP_NET_ADMIN))
1075 goto done; 1013 goto done;
1076 1014
1077 err = -EFAULT; 1015 err = -EFAULT;
@@ -1146,7 +1084,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1146 1084
1147 case SIOCDELTUNNEL: 1085 case SIOCDELTUNNEL:
1148 err = -EPERM; 1086 err = -EPERM;
1149 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1087 if (!capable(CAP_NET_ADMIN))
1150 goto done; 1088 goto done;
1151 1089
1152 if (dev == ign->fb_tunnel_dev) { 1090 if (dev == ign->fb_tunnel_dev) {
@@ -1218,7 +1156,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1218{ 1156{
1219 struct ip_tunnel *t = netdev_priv(dev); 1157 struct ip_tunnel *t = netdev_priv(dev);
1220 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1158 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1221 __be16 *p = (__be16 *)(iph+1); 1159 __be16 *p = (__be16*)(iph+1);
1222 1160
1223 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 1161 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1224 p[0] = t->parms.o_flags; 1162 p[0] = t->parms.o_flags;
@@ -1302,23 +1240,15 @@ static const struct net_device_ops ipgre_netdev_ops = {
1302 .ndo_start_xmit = ipgre_tunnel_xmit, 1240 .ndo_start_xmit = ipgre_tunnel_xmit,
1303 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1241 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1304 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1242 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1305 .ndo_get_stats64 = ipgre_get_stats64, 1243 .ndo_get_stats = ipgre_get_stats,
1306}; 1244};
1307 1245
1308static void ipgre_dev_free(struct net_device *dev) 1246static void ipgre_dev_free(struct net_device *dev)
1309{ 1247{
1310 struct ip_tunnel *tunnel = netdev_priv(dev);
1311
1312 gro_cells_destroy(&tunnel->gro_cells);
1313 free_percpu(dev->tstats); 1248 free_percpu(dev->tstats);
1314 free_netdev(dev); 1249 free_netdev(dev);
1315} 1250}
1316 1251
1317#define GRE_FEATURES (NETIF_F_SG | \
1318 NETIF_F_FRAGLIST | \
1319 NETIF_F_HIGHDMA | \
1320 NETIF_F_HW_CSUM)
1321
1322static void ipgre_tunnel_setup(struct net_device *dev) 1252static void ipgre_tunnel_setup(struct net_device *dev)
1323{ 1253{
1324 dev->netdev_ops = &ipgre_netdev_ops; 1254 dev->netdev_ops = &ipgre_netdev_ops;
@@ -1332,16 +1262,12 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1332 dev->addr_len = 4; 1262 dev->addr_len = 4;
1333 dev->features |= NETIF_F_NETNS_LOCAL; 1263 dev->features |= NETIF_F_NETNS_LOCAL;
1334 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1264 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1335
1336 dev->features |= GRE_FEATURES;
1337 dev->hw_features |= GRE_FEATURES;
1338} 1265}
1339 1266
1340static int ipgre_tunnel_init(struct net_device *dev) 1267static int ipgre_tunnel_init(struct net_device *dev)
1341{ 1268{
1342 struct ip_tunnel *tunnel; 1269 struct ip_tunnel *tunnel;
1343 struct iphdr *iph; 1270 struct iphdr *iph;
1344 int err;
1345 1271
1346 tunnel = netdev_priv(dev); 1272 tunnel = netdev_priv(dev);
1347 iph = &tunnel->parms.iph; 1273 iph = &tunnel->parms.iph;
@@ -1368,12 +1294,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
1368 if (!dev->tstats) 1294 if (!dev->tstats)
1369 return -ENOMEM; 1295 return -ENOMEM;
1370 1296
1371 err = gro_cells_init(&tunnel->gro_cells, dev);
1372 if (err) {
1373 free_percpu(dev->tstats);
1374 return err;
1375 }
1376
1377 return 0; 1297 return 0;
1378} 1298}
1379 1299
@@ -1574,7 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1574 .ndo_set_mac_address = eth_mac_addr, 1494 .ndo_set_mac_address = eth_mac_addr,
1575 .ndo_validate_addr = eth_validate_addr, 1495 .ndo_validate_addr = eth_validate_addr,
1576 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1496 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1577 .ndo_get_stats64 = ipgre_get_stats64, 1497 .ndo_get_stats = ipgre_get_stats,
1578}; 1498};
1579 1499
1580static void ipgre_tap_setup(struct net_device *dev) 1500static void ipgre_tap_setup(struct net_device *dev)
@@ -1605,7 +1525,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1605 return -EEXIST; 1525 return -EEXIST;
1606 1526
1607 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) 1527 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1608 eth_hw_addr_random(dev); 1528 random_ether_addr(dev->dev_addr);
1609 1529
1610 mtu = ipgre_tunnel_bind_dev(dev); 1530 mtu = ipgre_tunnel_bind_dev(dev);
1611 if (!tb[IFLA_MTU]) 1531 if (!tb[IFLA_MTU])
@@ -1721,18 +1641,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1721 struct ip_tunnel *t = netdev_priv(dev); 1641 struct ip_tunnel *t = netdev_priv(dev);
1722 struct ip_tunnel_parm *p = &t->parms; 1642 struct ip_tunnel_parm *p = &t->parms;
1723 1643
1724 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || 1644 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1725 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || 1645 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1726 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || 1646 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1727 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || 1647 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1728 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || 1648 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1729 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || 1649 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1730 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || 1650 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1731 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || 1651 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1732 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || 1652 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1733 nla_put_u8(skb, IFLA_GRE_PMTUDISC, 1653 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1734 !!(p->iph.frag_off & htons(IP_DF)))) 1654
1735 goto nla_put_failure;
1736 return 0; 1655 return 0;
1737 1656
1738nla_put_failure: 1657nla_put_failure:
@@ -1786,7 +1705,7 @@ static int __init ipgre_init(void)
1786{ 1705{
1787 int err; 1706 int err;
1788 1707
1789 pr_info("GRE over IPv4 tunneling driver\n"); 1708 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1790 1709
1791 err = register_pernet_device(&ipgre_net_ops); 1710 err = register_pernet_device(&ipgre_net_ops);
1792 if (err < 0) 1711 if (err < 0)
@@ -1794,7 +1713,7 @@ static int __init ipgre_init(void)
1794 1713
1795 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); 1714 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1796 if (err < 0) { 1715 if (err < 0) {
1797 pr_info("%s: can't add protocol\n", __func__); 1716 printk(KERN_INFO "ipgre init: can't add protocol\n");
1798 goto add_proto_failed; 1717 goto add_proto_failed;
1799 } 1718 }
1800 1719
@@ -1823,7 +1742,7 @@ static void __exit ipgre_fini(void)
1823 rtnl_link_unregister(&ipgre_tap_ops); 1742 rtnl_link_unregister(&ipgre_tap_ops);
1824 rtnl_link_unregister(&ipgre_link_ops); 1743 rtnl_link_unregister(&ipgre_link_ops);
1825 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) 1744 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1826 pr_info("%s: can't remove protocol\n", __func__); 1745 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1827 unregister_pernet_device(&ipgre_net_ops); 1746 unregister_pernet_device(&ipgre_net_ops);
1828} 1747}
1829 1748
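The ipgre_init() hunks above keep the usual kernel module init pattern: register resources in order and unwind in reverse through goto labels when a later step fails (register_pernet_device, then gre_add_protocol, then the rtnl_link registrations). A standalone sketch of that control flow with made-up resource names, for illustration only:

#include <stdio.h>

static int setup_a(void) { return 0; }    /* stands in for the first registration */
static int setup_b(void) { return -1; }   /* stands in for a later step that fails */
static void teardown_a(void) { puts("teardown a"); }

static int demo_init(void)
{
        int err;

        err = setup_a();
        if (err)
                goto out;
        err = setup_b();
        if (err)
                goto undo_a;              /* unwind only what already succeeded */
        return 0;

undo_a:
        teardown_a();
out:
        return err;
}

int main(void)
{
        printf("init: %d\n", demo_init());
        return 0;
}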
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f1395a6fb35..073a9b01c40 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -113,8 +113,7 @@
113 * 2 of the License, or (at your option) any later version. 113 * 2 of the License, or (at your option) any later version.
114 */ 114 */
115 115
116#define pr_fmt(fmt) "IPv4: " fmt 116#include <asm/system.h>
117
118#include <linux/module.h> 117#include <linux/module.h>
119#include <linux/types.h> 118#include <linux/types.h>
120#include <linux/kernel.h> 119#include <linux/kernel.h>
@@ -149,7 +148,7 @@
149/* 148/*
150 * Process Router Attention IP option (RFC 2113) 149 * Process Router Attention IP option (RFC 2113)
151 */ 150 */
152bool ip_call_ra_chain(struct sk_buff *skb) 151int ip_call_ra_chain(struct sk_buff *skb)
153{ 152{
154 struct ip_ra_chain *ra; 153 struct ip_ra_chain *ra;
155 u8 protocol = ip_hdr(skb)->protocol; 154 u8 protocol = ip_hdr(skb)->protocol;
@@ -168,7 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
168 net_eq(sock_net(sk), dev_net(dev))) { 167 net_eq(sock_net(sk), dev_net(dev))) {
169 if (ip_is_fragment(ip_hdr(skb))) { 168 if (ip_is_fragment(ip_hdr(skb))) {
170 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) 169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
171 return true; 170 return 1;
172 } 171 }
173 if (last) { 172 if (last) {
174 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 173 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -181,9 +180,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
181 180
182 if (last) { 181 if (last) {
183 raw_rcv(last, skb); 182 raw_rcv(last, skb);
184 return true; 183 return 1;
185 } 184 }
186 return false; 185 return 0;
187} 186}
188 187
189static int ip_local_deliver_finish(struct sk_buff *skb) 188static int ip_local_deliver_finish(struct sk_buff *skb)
@@ -198,19 +197,21 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
198 rcu_read_lock(); 197 rcu_read_lock();
199 { 198 {
200 int protocol = ip_hdr(skb)->protocol; 199 int protocol = ip_hdr(skb)->protocol;
200 int hash, raw;
201 const struct net_protocol *ipprot; 201 const struct net_protocol *ipprot;
202 int raw;
203 202
204 resubmit: 203 resubmit:
205 raw = raw_local_deliver(skb, protocol); 204 raw = raw_local_deliver(skb, protocol);
206 205
207 ipprot = rcu_dereference(inet_protos[protocol]); 206 hash = protocol & (MAX_INET_PROTOS - 1);
207 ipprot = rcu_dereference(inet_protos[hash]);
208 if (ipprot != NULL) { 208 if (ipprot != NULL) {
209 int ret; 209 int ret;
210 210
211 if (!net_eq(net, &init_net) && !ipprot->netns_ok) { 211 if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
212 net_info_ratelimited("%s: proto %d isn't netns-ready\n", 212 if (net_ratelimit())
213 __func__, protocol); 213 printk("%s: proto %d isn't netns-ready\n",
214 __func__, protocol);
214 kfree_skb(skb); 215 kfree_skb(skb);
215 goto out; 216 goto out;
216 } 217 }
@@ -264,7 +265,7 @@ int ip_local_deliver(struct sk_buff *skb)
264 ip_local_deliver_finish); 265 ip_local_deliver_finish);
265} 266}
266 267
267static inline bool ip_rcv_options(struct sk_buff *skb) 268static inline int ip_rcv_options(struct sk_buff *skb)
268{ 269{
269 struct ip_options *opt; 270 struct ip_options *opt;
270 const struct iphdr *iph; 271 const struct iphdr *iph;
@@ -296,10 +297,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
296 297
297 if (in_dev) { 298 if (in_dev) {
298 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
299 if (IN_DEV_LOG_MARTIANS(in_dev)) 300 if (IN_DEV_LOG_MARTIANS(in_dev) &&
300 net_info_ratelimited("source route option %pI4 -> %pI4\n", 301 net_ratelimit())
301 &iph->saddr, 302 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
302 &iph->daddr); 303 &iph->saddr, &iph->daddr);
303 goto drop; 304 goto drop;
304 } 305 }
305 } 306 }
@@ -308,40 +309,31 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
308 goto drop; 309 goto drop;
309 } 310 }
310 311
311 return false; 312 return 0;
312drop: 313drop:
313 return true; 314 return -1;
314} 315}
315 316
316int sysctl_ip_early_demux __read_mostly = 1;
317EXPORT_SYMBOL(sysctl_ip_early_demux);
318
319static int ip_rcv_finish(struct sk_buff *skb) 317static int ip_rcv_finish(struct sk_buff *skb)
320{ 318{
321 const struct iphdr *iph = ip_hdr(skb); 319 const struct iphdr *iph = ip_hdr(skb);
322 struct rtable *rt; 320 struct rtable *rt;
323 321
324 if (sysctl_ip_early_demux && !skb_dst(skb)) {
325 const struct net_protocol *ipprot;
326 int protocol = iph->protocol;
327
328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux) {
330 ipprot->early_demux(skb);
331 /* must reload iph, skb->head might have changed */
332 iph = ip_hdr(skb);
333 }
334 }
335
336 /* 322 /*
337 * Initialise the virtual path cache for the packet. It describes 323 * Initialise the virtual path cache for the packet. It describes
338 * how the packet travels inside Linux networking. 324 * how the packet travels inside Linux networking.
339 */ 325 */
340 if (!skb_dst(skb)) { 326 if (skb_dst(skb) == NULL) {
341 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
342 iph->tos, skb->dev); 328 iph->tos, skb->dev);
343 if (unlikely(err)) { 329 if (unlikely(err)) {
344 if (err == -EXDEV) 330 if (err == -EHOSTUNREACH)
331 IP_INC_STATS_BH(dev_net(skb->dev),
332 IPSTATS_MIB_INADDRERRORS);
333 else if (err == -ENETUNREACH)
334 IP_INC_STATS_BH(dev_net(skb->dev),
335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
345 NET_INC_STATS_BH(dev_net(skb->dev), 337 NET_INC_STATS_BH(dev_net(skb->dev),
346 LINUX_MIB_IPRPFILTER); 338 LINUX_MIB_IPRPFILTER);
347 goto drop; 339 goto drop;
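In ip_local_deliver_finish() above, the right-hand column masks the protocol number with (MAX_INET_PROTOS - 1) before indexing inet_protos[], while the left-hand code indexes the table with the protocol byte directly; for a 256-entry table and an 8-bit protocol the mask is a no-op, which is why the newer code drops it. Either way the idea is a flat handler table keyed by the IP protocol number. A sketch with hypothetical handlers:

#include <stdio.h>

#define MAX_PROTOS 256                    /* the protocol field is one byte */

typedef int (*proto_handler)(const void *pkt, int len);

static int tcp_demo(const void *pkt, int len) { (void)pkt; return len; }
static int udp_demo(const void *pkt, int len) { (void)pkt; return len; }

static proto_handler handlers[MAX_PROTOS];

static int deliver(unsigned char protocol, const void *pkt, int len)
{
        proto_handler h = handlers[protocol];
        return h ? h(pkt, len) : -1;      /* no handler: caller may raise ICMP */
}

int main(void)
{
        handlers[6]  = tcp_demo;          /* IPPROTO_TCP */
        handlers[17] = udp_demo;          /* IPPROTO_UDP */
        printf("%d\n", deliver(6, "x", 1));
        return 0;
}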
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f6289bf6f33..05d20cca9d6 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -9,8 +9,6 @@
9 * 9 *
10 */ 10 */
11 11
12#define pr_fmt(fmt) "IPv4: " fmt
13
14#include <linux/capability.h> 12#include <linux/capability.h>
15#include <linux/module.h> 13#include <linux/module.h>
16#include <linux/slab.h> 14#include <linux/slab.h>
@@ -27,7 +25,6 @@
27#include <net/icmp.h> 25#include <net/icmp.h>
28#include <net/route.h> 26#include <net/route.h>
29#include <net/cipso_ipv4.h> 27#include <net/cipso_ipv4.h>
30#include <net/ip_fib.h>
31 28
32/* 29/*
33 * Write options to IP header, record destination address to 30 * Write options to IP header, record destination address to
@@ -93,6 +90,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
93 unsigned char *sptr, *dptr; 90 unsigned char *sptr, *dptr;
94 int soffset, doffset; 91 int soffset, doffset;
95 int optlen; 92 int optlen;
93 __be32 daddr;
96 94
97 memset(dopt, 0, sizeof(struct ip_options)); 95 memset(dopt, 0, sizeof(struct ip_options));
98 96
@@ -104,6 +102,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
104 sptr = skb_network_header(skb); 102 sptr = skb_network_header(skb);
105 dptr = dopt->__data; 103 dptr = dopt->__data;
106 104
105 daddr = skb_rtable(skb)->rt_spec_dst;
106
107 if (sopt->rr) { 107 if (sopt->rr) {
108 optlen = sptr[sopt->rr+1]; 108 optlen = sptr[sopt->rr+1];
109 soffset = sptr[sopt->rr+2]; 109 soffset = sptr[sopt->rr+2];
@@ -177,8 +177,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
177 doffset -= 4; 177 doffset -= 4;
178 } 178 }
179 if (doffset > 3) { 179 if (doffset > 3) {
180 __be32 daddr = fib_compute_spec_dst(skb);
181
182 memcpy(&start[doffset-1], &daddr, 4); 180 memcpy(&start[doffset-1], &daddr, 4);
183 dopt->faddr = faddr; 181 dopt->faddr = faddr;
184 dptr[0] = start[0]; 182 dptr[0] = start[0];
@@ -210,10 +208,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
210 * Simple and stupid 8), but the most efficient way. 208 * Simple and stupid 8), but the most efficient way.
211 */ 209 */
212 210
213void ip_options_fragment(struct sk_buff *skb) 211void ip_options_fragment(struct sk_buff * skb)
214{ 212{
215 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); 213 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
216 struct ip_options *opt = &(IPCB(skb)->opt); 214 struct ip_options * opt = &(IPCB(skb)->opt);
217 int l = opt->optlen; 215 int l = opt->optlen;
218 int optlen; 216 int optlen;
219 217
@@ -241,15 +239,6 @@ void ip_options_fragment(struct sk_buff *skb)
241 opt->ts_needtime = 0; 239 opt->ts_needtime = 0;
242} 240}
243 241
244/* helper used by ip_options_compile() to call fib_compute_spec_dst()
245 * at most one time.
246 */
247static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
248{
249 if (*spec_dst == htonl(INADDR_ANY))
250 *spec_dst = fib_compute_spec_dst(skb);
251}
252
253/* 242/*
254 * Verify options and fill pointers in struct options. 243 * Verify options and fill pointers in struct options.
255 * Caller should clear *opt, and set opt->data. 244 * Caller should clear *opt, and set opt->data.
@@ -257,14 +246,14 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
257 */ 246 */
258 247
259int ip_options_compile(struct net *net, 248int ip_options_compile(struct net *net,
260 struct ip_options *opt, struct sk_buff *skb) 249 struct ip_options * opt, struct sk_buff * skb)
261{ 250{
262 __be32 spec_dst = htonl(INADDR_ANY); 251 int l;
263 unsigned char *pp_ptr = NULL; 252 unsigned char * iph;
253 unsigned char * optptr;
254 int optlen;
255 unsigned char * pp_ptr = NULL;
264 struct rtable *rt = NULL; 256 struct rtable *rt = NULL;
265 unsigned char *optptr;
266 unsigned char *iph;
267 int optlen, l;
268 257
269 if (skb != NULL) { 258 if (skb != NULL) {
270 rt = skb_rtable(skb); 259 rt = skb_rtable(skb);
@@ -340,8 +329,7 @@ int ip_options_compile(struct net *net,
340 goto error; 329 goto error;
341 } 330 }
342 if (rt) { 331 if (rt) {
343 spec_dst_fill(&spec_dst, skb); 332 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
344 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
345 opt->is_changed = 1; 333 opt->is_changed = 1;
346 } 334 }
347 optptr[2] += 4; 335 optptr[2] += 4;
@@ -383,8 +371,7 @@ int ip_options_compile(struct net *net,
383 } 371 }
384 opt->ts = optptr - iph; 372 opt->ts = optptr - iph;
385 if (rt) { 373 if (rt) {
386 spec_dst_fill(&spec_dst, skb); 374 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
387 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
388 timeptr = &optptr[optptr[2]+3]; 375 timeptr = &optptr[optptr[2]+3];
389 } 376 }
390 opt->ts_needaddr = 1; 377 opt->ts_needaddr = 1;
@@ -409,7 +396,7 @@ int ip_options_compile(struct net *net,
409 optptr[2] += 8; 396 optptr[2] += 8;
410 break; 397 break;
411 default: 398 default:
412 if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { 399 if (!skb && !capable(CAP_NET_RAW)) {
413 pp_ptr = optptr + 3; 400 pp_ptr = optptr + 3;
414 goto error; 401 goto error;
415 } 402 }
@@ -424,7 +411,7 @@ int ip_options_compile(struct net *net,
424 opt->is_changed = 1; 411 opt->is_changed = 1;
425 } 412 }
426 } else { 413 } else {
427 unsigned int overflow = optptr[3]>>4; 414 unsigned overflow = optptr[3]>>4;
428 if (overflow == 15) { 415 if (overflow == 15) {
429 pp_ptr = optptr + 3; 416 pp_ptr = optptr + 3;
430 goto error; 417 goto error;
@@ -445,7 +432,7 @@ int ip_options_compile(struct net *net,
445 opt->router_alert = optptr - iph; 432 opt->router_alert = optptr - iph;
446 break; 433 break;
447 case IPOPT_CIPSO: 434 case IPOPT_CIPSO:
448 if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { 435 if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
449 pp_ptr = optptr; 436 pp_ptr = optptr;
450 goto error; 437 goto error;
451 } 438 }
@@ -458,7 +445,7 @@ int ip_options_compile(struct net *net,
458 case IPOPT_SEC: 445 case IPOPT_SEC:
459 case IPOPT_SID: 446 case IPOPT_SID:
460 default: 447 default:
461 if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { 448 if (!skb && !capable(CAP_NET_RAW)) {
462 pp_ptr = optptr; 449 pp_ptr = optptr;
463 goto error; 450 goto error;
464 } 451 }
@@ -484,20 +471,20 @@ EXPORT_SYMBOL(ip_options_compile);
484 * Undo all the changes done by ip_options_compile(). 471 * Undo all the changes done by ip_options_compile().
485 */ 472 */
486 473
487void ip_options_undo(struct ip_options *opt) 474void ip_options_undo(struct ip_options * opt)
488{ 475{
489 if (opt->srr) { 476 if (opt->srr) {
490 unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); 477 unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
491 memmove(optptr+7, optptr+3, optptr[1]-7); 478 memmove(optptr+7, optptr+3, optptr[1]-7);
492 memcpy(optptr+3, &opt->faddr, 4); 479 memcpy(optptr+3, &opt->faddr, 4);
493 } 480 }
494 if (opt->rr_needaddr) { 481 if (opt->rr_needaddr) {
495 unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); 482 unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
496 optptr[2] -= 4; 483 optptr[2] -= 4;
497 memset(&optptr[optptr[2]-1], 0, 4); 484 memset(&optptr[optptr[2]-1], 0, 4);
498 } 485 }
499 if (opt->ts) { 486 if (opt->ts) {
500 unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); 487 unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
501 if (opt->ts_needtime) { 488 if (opt->ts_needtime) {
502 optptr[2] -= 4; 489 optptr[2] -= 4;
503 memset(&optptr[optptr[2]-1], 0, 4); 490 memset(&optptr[optptr[2]-1], 0, 4);
@@ -560,8 +547,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
560 547
561void ip_forward_options(struct sk_buff *skb) 548void ip_forward_options(struct sk_buff *skb)
562{ 549{
563 struct ip_options *opt = &(IPCB(skb)->opt); 550 struct ip_options * opt = &(IPCB(skb)->opt);
564 unsigned char *optptr; 551 unsigned char * optptr;
565 struct rtable *rt = skb_rtable(skb); 552 struct rtable *rt = skb_rtable(skb);
566 unsigned char *raw = skb_network_header(skb); 553 unsigned char *raw = skb_network_header(skb);
567 554
@@ -581,18 +568,15 @@ void ip_forward_options(struct sk_buff *skb)
581 ) { 568 ) {
582 if (srrptr + 3 > srrspace) 569 if (srrptr + 3 > srrspace)
583 break; 570 break;
584 if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) 571 if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0)
585 break; 572 break;
586 } 573 }
587 if (srrptr + 3 <= srrspace) { 574 if (srrptr + 3 <= srrspace) {
588 opt->is_changed = 1; 575 opt->is_changed = 1;
589 ip_hdr(skb)->daddr = opt->nexthop;
590 ip_rt_get_source(&optptr[srrptr-1], skb, rt); 576 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
591 optptr[2] = srrptr+4; 577 optptr[2] = srrptr+4;
592 } else { 578 } else if (net_ratelimit())
593 net_crit_ratelimited("%s(): Argh! Destination lost!\n", 579 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
594 __func__);
595 }
596 if (opt->ts_needaddr) { 580 if (opt->ts_needaddr) {
597 optptr = raw + opt->ts; 581 optptr = raw + opt->ts;
598 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); 582 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
@@ -656,7 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
656 } 640 }
657 if (srrptr <= srrspace) { 641 if (srrptr <= srrspace) {
658 opt->srr_is_hit = 1; 642 opt->srr_is_hit = 1;
659 opt->nexthop = nexthop; 643 iph->daddr = nexthop;
660 opt->is_changed = 1; 644 opt->is_changed = 1;
661 } 645 }
662 return 0; 646 return 0;
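Both the record-route and timestamp handling in ip_options_compile() above rely on the RFC 791 option layout: byte 0 is the type, byte 1 the length, byte 2 a 1-based pointer to the next free slot, so an address is written at optptr[pointer - 1] and the pointer then advances by 4. A simplified standalone sketch of recording one address; the kernel additionally validates the length byte and handles the timestamp sub-formats:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Record an IPv4 address into a record-route option buffer.
 * opt[0]=type, opt[1]=length, opt[2]=pointer (1-based). Returns 0 on
 * success, -1 if the option area is already full. */
static int rr_record(uint8_t *opt, uint32_t addr_be)
{
        uint8_t len = opt[1], ptr = opt[2];

        if (ptr + 3 > len)                /* no room for another address */
                return -1;
        memcpy(&opt[ptr - 1], &addr_be, 4);
        opt[2] = ptr + 4;
        return 0;
}

int main(void)
{
        uint8_t opt[11] = { 7, 11, 4 };   /* IPOPT_RR with room for two hops */
        uint32_t a = 0x0a000001;          /* 10.0.0.1, host order for the demo */
        printf("%d %d\n", rr_record(opt, a), opt[2]);
        return 0;
}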
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3e98ed2bff5..8c6563361ab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -43,6 +43,7 @@
43 */ 43 */
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/system.h>
46#include <linux/module.h> 47#include <linux/module.h>
47#include <linux/types.h> 48#include <linux/types.h>
48#include <linux/kernel.h> 49#include <linux/kernel.h>
@@ -113,6 +114,19 @@ int ip_local_out(struct sk_buff *skb)
113} 114}
114EXPORT_SYMBOL_GPL(ip_local_out); 115EXPORT_SYMBOL_GPL(ip_local_out);
115 116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120 skb_reset_mac_header(newskb);
121 __skb_pull(newskb, skb_network_offset(newskb));
122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 WARN_ON(!skb_dst(newskb));
125 skb_dst_force(newskb);
126 netif_rx_ni(newskb);
127 return 0;
128}
129
116static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 130static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117{ 131{
118 int ttl = inet->uc_ttl; 132 int ttl = inet->uc_ttl;
@@ -170,7 +184,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
170 struct net_device *dev = dst->dev; 184 struct net_device *dev = dst->dev;
171 unsigned int hh_len = LL_RESERVED_SPACE(dev); 185 unsigned int hh_len = LL_RESERVED_SPACE(dev);
172 struct neighbour *neigh; 186 struct neighbour *neigh;
173 u32 nexthop;
174 187
175 if (rt->rt_type == RTN_MULTICAST) { 188 if (rt->rt_type == RTN_MULTICAST) {
176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 189 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -188,25 +201,22 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 } 201 }
189 if (skb->sk) 202 if (skb->sk)
190 skb_set_owner_w(skb2, skb->sk); 203 skb_set_owner_w(skb2, skb->sk);
191 consume_skb(skb); 204 kfree_skb(skb);
192 skb = skb2; 205 skb = skb2;
193 } 206 }
194 207
195 rcu_read_lock_bh(); 208 rcu_read_lock();
196 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); 209 neigh = dst_get_neighbour(dst);
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 210 if (neigh) {
198 if (unlikely(!neigh)) 211 int res = neigh_output(neigh, skb);
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
200 if (!IS_ERR(neigh)) {
201 int res = dst_neigh_output(dst, neigh, skb);
202 212
203 rcu_read_unlock_bh(); 213 rcu_read_unlock();
204 return res; 214 return res;
205 } 215 }
206 rcu_read_unlock_bh(); 216 rcu_read_unlock();
207 217
208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 218 if (net_ratelimit())
209 __func__); 219 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
210 kfree_skb(skb); 220 kfree_skb(skb);
211 return -EINVAL; 221 return -EINVAL;
212} 222}
@@ -272,7 +282,7 @@ int ip_mc_output(struct sk_buff *skb)
272 if (newskb) 282 if (newskb)
273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 283 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
274 newskb, NULL, newskb->dev, 284 newskb, NULL, newskb->dev,
275 dev_loopback_xmit); 285 ip_dev_loopback_xmit);
276 } 286 }
277 287
278 /* Multicasts with ttl 0 must not go beyond the host */ 288 /* Multicasts with ttl 0 must not go beyond the host */
@@ -287,7 +297,7 @@ int ip_mc_output(struct sk_buff *skb)
287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 297 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
288 if (newskb) 298 if (newskb)
289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 299 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
290 NULL, newskb->dev, dev_loopback_xmit); 300 NULL, newskb->dev, ip_dev_loopback_xmit);
291 } 301 }
292 302
293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 303 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -309,20 +319,6 @@ int ip_output(struct sk_buff *skb)
309 !(IPCB(skb)->flags & IPSKB_REROUTED)); 319 !(IPCB(skb)->flags & IPSKB_REROUTED));
310} 320}
311 321
312/*
313 * copy saddr and daddr, possibly using 64bit load/stores
314 * Equivalent to :
315 * iph->saddr = fl4->saddr;
316 * iph->daddr = fl4->daddr;
317 */
318static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
319{
320 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
321 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
322 memcpy(&iph->saddr, &fl4->saddr,
323 sizeof(fl4->saddr) + sizeof(fl4->daddr));
324}
325
326int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) 322int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
327{ 323{
328 struct sock *sk = skb->sk; 324 struct sock *sk = skb->sk;
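The removed ip_copy_addrs() helper above copies saddr and daddr with a single memcpy, relying on the two fields being adjacent in struct flowi4 and guarding that layout assumption with BUILD_BUG_ON. The same idea in a standalone sketch, using C11 _Static_assert in place of the kernel macro and stand-in structures rather than the real ones:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct demo_flow { uint32_t saddr, daddr; };   /* stand-in for flowi4 */
struct demo_hdr  { uint32_t saddr, daddr; };   /* stand-in for the iphdr fields */

static void copy_addrs(struct demo_hdr *h, const struct demo_flow *fl)
{
        /* Both 32-bit addresses are copied in one go; the assert keeps the
         * shortcut honest if the structure layout ever changes. */
        _Static_assert(offsetof(struct demo_flow, daddr) ==
                       offsetof(struct demo_flow, saddr) + sizeof(uint32_t),
                       "saddr/daddr must be adjacent");
        memcpy(&h->saddr, &fl->saddr, sizeof(fl->saddr) + sizeof(fl->daddr));
}

int main(void)
{
        struct demo_flow fl = { 0x01020304, 0x05060708 };
        struct demo_hdr h;
        copy_addrs(&h, &fl);
        printf("%x %x\n", h.saddr, h.daddr);
        return 0;
}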
@@ -371,7 +367,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
371 skb_dst_set_noref(skb, &rt->dst); 367 skb_dst_set_noref(skb, &rt->dst);
372 368
373packet_routed: 369packet_routed:
374 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) 370 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
375 goto no_route; 371 goto no_route;
376 372
377 /* OK, we know where to send it, allocate and build IP header. */ 373 /* OK, we know where to send it, allocate and build IP header. */
@@ -385,8 +381,8 @@ packet_routed:
385 iph->frag_off = 0; 381 iph->frag_off = 0;
386 iph->ttl = ip_select_ttl(inet, &rt->dst); 382 iph->ttl = ip_select_ttl(inet, &rt->dst);
387 iph->protocol = sk->sk_protocol; 383 iph->protocol = sk->sk_protocol;
388 ip_copy_addrs(iph, fl4); 384 iph->saddr = fl4->saddr;
389 385 iph->daddr = fl4->daddr;
390 /* Transport layer set skb->h.foo itself. */ 386 /* Transport layer set skb->h.foo itself. */
391 387
392 if (inet_opt && inet_opt->opt.optlen) { 388 if (inet_opt && inet_opt->opt.optlen) {
@@ -467,9 +463,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 463
468 iph = ip_hdr(skb); 464 iph = ip_hdr(skb);
469 465
470 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || 466 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
471 (IPCB(skb)->frag_max_size &&
472 IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
473 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 467 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
474 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 468 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
475 htonl(ip_skb_dst_mtu(skb))); 469 htonl(ip_skb_dst_mtu(skb)));
@@ -595,10 +589,6 @@ slow_path_clean:
595 } 589 }
596 590
597slow_path: 591slow_path:
598 /* for offloaded checksums cleanup checksum before fragmentation */
599 if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
600 goto fail;
601
602 left = skb->len - hlen; /* Space per frame */ 592 left = skb->len - hlen; /* Space per frame */
603 ptr = hlen; /* Where to start from */ 593 ptr = hlen; /* Where to start from */
604 594
@@ -706,7 +696,7 @@ slow_path:
706 696
707 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
708 } 698 }
709 consume_skb(skb); 699 kfree_skb(skb);
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 700 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
711 return err; 701 return err;
712 702
@@ -797,7 +787,6 @@ static int __ip_append_data(struct sock *sk,
797 struct flowi4 *fl4, 787 struct flowi4 *fl4,
798 struct sk_buff_head *queue, 788 struct sk_buff_head *queue,
799 struct inet_cork *cork, 789 struct inet_cork *cork,
800 struct page_frag *pfrag,
801 int getfrag(void *from, char *to, int offset, 790 int getfrag(void *from, char *to, int offset,
802 int len, int odd, struct sk_buff *skb), 791 int len, int odd, struct sk_buff *skb),
803 void *from, int length, int transhdrlen, 792 void *from, int length, int transhdrlen,
@@ -992,30 +981,46 @@ alloc_new_skb:
992 } 981 }
993 } else { 982 } else {
994 int i = skb_shinfo(skb)->nr_frags; 983 int i = skb_shinfo(skb)->nr_frags;
995 984 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
996 err = -ENOMEM; 985 struct page *page = cork->page;
997 if (!sk_page_frag_refill(sk, pfrag)) 986 int off = cork->off;
998 goto error; 987 unsigned int left;
999 988
1000 if (!skb_can_coalesce(skb, i, pfrag->page, 989 if (page && (left = PAGE_SIZE - off) > 0) {
1001 pfrag->offset)) { 990 if (copy >= left)
1002 err = -EMSGSIZE; 991 copy = left;
1003 if (i == MAX_SKB_FRAGS) 992 if (page != frag->page) {
993 if (i == MAX_SKB_FRAGS) {
994 err = -EMSGSIZE;
995 goto error;
996 }
997 get_page(page);
998 skb_fill_page_desc(skb, i, page, off, 0);
999 frag = &skb_shinfo(skb)->frags[i];
1000 }
1001 } else if (i < MAX_SKB_FRAGS) {
1002 if (copy > PAGE_SIZE)
1003 copy = PAGE_SIZE;
1004 page = alloc_pages(sk->sk_allocation, 0);
1005 if (page == NULL) {
1006 err = -ENOMEM;
1004 goto error; 1007 goto error;
1008 }
1009 cork->page = page;
1010 cork->off = 0;
1005 1011
1006 __skb_fill_page_desc(skb, i, pfrag->page, 1012 skb_fill_page_desc(skb, i, page, 0, 0);
1007 pfrag->offset, 0); 1013 frag = &skb_shinfo(skb)->frags[i];
1008 skb_shinfo(skb)->nr_frags = ++i; 1014 } else {
1009 get_page(pfrag->page); 1015 err = -EMSGSIZE;
1016 goto error;
1010 } 1017 }
1011 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1018 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012 if (getfrag(from, 1019 err = -EFAULT;
1013 page_address(pfrag->page) + pfrag->offset, 1020 goto error;
1014 offset, copy, skb->len, skb) < 0) 1021 }
1015 goto error_efault; 1022 cork->off += copy;
1016 1023 frag->size += copy;
1017 pfrag->offset += copy;
1018 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1019 skb->len += copy; 1024 skb->len += copy;
1020 skb->data_len += copy; 1025 skb->data_len += copy;
1021 skb->truesize += copy; 1026 skb->truesize += copy;
@@ -1027,8 +1032,6 @@ alloc_new_skb:
1027 1032
1028 return 0; 1033 return 0;
1029 1034
1030error_efault:
1031 err = -EFAULT;
1032error: 1035error:
1033 cork->length -= length; 1036 cork->length -= length;
1034 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1037 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1069,6 +1072,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1069 cork->dst = &rt->dst; 1072 cork->dst = &rt->dst;
1070 cork->length = 0; 1073 cork->length = 0;
1071 cork->tx_flags = ipc->tx_flags; 1074 cork->tx_flags = ipc->tx_flags;
1075 cork->page = NULL;
1076 cork->off = 0;
1072 1077
1073 return 0; 1078 return 0;
1074} 1079}
@@ -1105,8 +1110,7 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1105 transhdrlen = 0; 1110 transhdrlen = 0;
1106 } 1111 }
1107 1112
1108 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, 1113 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1109 sk_page_frag(sk), getfrag,
1110 from, length, transhdrlen, flags); 1114 from, length, transhdrlen, flags);
1111} 1115}
1112 1116
@@ -1225,7 +1229,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1225 if (len > size) 1229 if (len > size)
1226 len = size; 1230 len = size;
1227 if (skb_can_coalesce(skb, i, page, offset)) { 1231 if (skb_can_coalesce(skb, i, page, offset)) {
1228 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); 1232 skb_shinfo(skb)->frags[i-1].size += len;
1229 } else if (i < MAX_SKB_FRAGS) { 1233 } else if (i < MAX_SKB_FRAGS) {
1230 get_page(page); 1234 get_page(page);
1231 skb_fill_page_desc(skb, i, page, offset, len); 1235 skb_fill_page_desc(skb, i, page, offset, len);
@@ -1329,10 +1333,11 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1329 iph->ihl = 5; 1333 iph->ihl = 5;
1330 iph->tos = inet->tos; 1334 iph->tos = inet->tos;
1331 iph->frag_off = df; 1335 iph->frag_off = df;
1336 ip_select_ident(iph, &rt->dst, sk);
1332 iph->ttl = ttl; 1337 iph->ttl = ttl;
1333 iph->protocol = sk->sk_protocol; 1338 iph->protocol = sk->sk_protocol;
1334 ip_copy_addrs(iph, fl4); 1339 iph->saddr = fl4->saddr;
1335 ip_select_ident(iph, &rt->dst, sk); 1340 iph->daddr = fl4->daddr;
1336 1341
1337 if (opt) { 1342 if (opt) {
1338 iph->ihl += opt->optlen>>2; 1343 iph->ihl += opt->optlen>>2;
@@ -1357,8 +1362,9 @@ out:
1357 return skb; 1362 return skb;
1358} 1363}
1359 1364
1360int ip_send_skb(struct net *net, struct sk_buff *skb) 1365int ip_send_skb(struct sk_buff *skb)
1361{ 1366{
1367 struct net *net = sock_net(skb->sk);
1362 int err; 1368 int err;
1363 1369
1364 err = ip_local_out(skb); 1370 err = ip_local_out(skb);
@@ -1381,7 +1387,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1381 return 0; 1387 return 0;
1382 1388
1383 /* Netfilter gets whole the not fragmented skb. */ 1389 /* Netfilter gets whole the not fragmented skb. */
1384 return ip_send_skb(sock_net(sk), skb); 1390 return ip_send_skb(skb);
1385} 1391}
1386 1392
1387/* 1393/*
@@ -1428,8 +1434,7 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1428 if (err) 1434 if (err)
1429 return ERR_PTR(err); 1435 return ERR_PTR(err);
1430 1436
1431 err = __ip_append_data(sk, fl4, &queue, &cork, 1437 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1432 &current->task_frag, getfrag,
1433 from, length, transhdrlen, flags); 1438 from, length, transhdrlen, flags);
1434 if (err) { 1439 if (err) {
1435 __ip_flush_pending_frames(sk, &queue, &cork); 1440 __ip_flush_pending_frames(sk, &queue, &cork);
@@ -1454,34 +1459,19 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1454 1459
1455/* 1460/*
1456 * Generic function to send a packet as reply to another packet. 1461 * Generic function to send a packet as reply to another packet.
1457 * Used to send some TCP resets/acks so far. 1462 * Used to send TCP resets so far. ICMP should use this function too.
1458 * 1463 *
1459 * Use a fake percpu inet socket to avoid false sharing and contention. 1464 * Should run single threaded per socket because it uses the sock
1465 * structure to pass arguments.
1460 */ 1466 */
1461static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { 1467void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1462 .sk = { 1468 struct ip_reply_arg *arg, unsigned int len)
1463 .__sk_common = {
1464 .skc_refcnt = ATOMIC_INIT(1),
1465 },
1466 .sk_wmem_alloc = ATOMIC_INIT(1),
1467 .sk_allocation = GFP_ATOMIC,
1468 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1469 },
1470 .pmtudisc = IP_PMTUDISC_WANT,
1471 .uc_ttl = -1,
1472};
1473
1474void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1475 __be32 saddr, const struct ip_reply_arg *arg,
1476 unsigned int len)
1477{ 1469{
1470 struct inet_sock *inet = inet_sk(sk);
1478 struct ip_options_data replyopts; 1471 struct ip_options_data replyopts;
1479 struct ipcm_cookie ipc; 1472 struct ipcm_cookie ipc;
1480 struct flowi4 fl4; 1473 struct flowi4 fl4;
1481 struct rtable *rt = skb_rtable(skb); 1474 struct rtable *rt = skb_rtable(skb);
1482 struct sk_buff *nskb;
1483 struct sock *sk;
1484 struct inet_sock *inet;
1485 1475
1486 if (ip_options_echo(&replyopts.opt.opt, skb)) 1476 if (ip_options_echo(&replyopts.opt.opt, skb))
1487 return; 1477 return;
@@ -1498,41 +1488,39 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1498 } 1488 }
1499 1489
1500 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1490 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1501 RT_TOS(arg->tos), 1491 RT_TOS(ip_hdr(skb)->tos),
1502 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, 1492 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1503 ip_reply_arg_flowi_flags(arg), 1493 ip_reply_arg_flowi_flags(arg),
1504 daddr, saddr, 1494 daddr, rt->rt_spec_dst,
1505 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1495 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1506 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1496 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1507 rt = ip_route_output_key(net, &fl4); 1497 rt = ip_route_output_key(sock_net(sk), &fl4);
1508 if (IS_ERR(rt)) 1498 if (IS_ERR(rt))
1509 return; 1499 return;
1510 1500
1511 inet = &get_cpu_var(unicast_sock); 1501 /* And let IP do all the hard work.
1512 1502
1513 inet->tos = arg->tos; 1503 This chunk is not reenterable, hence spinlock.
1514 sk = &inet->sk; 1504 Note that it uses the fact, that this function is called
1505 with locally disabled BH and that sk cannot be already spinlocked.
1506 */
1507 bh_lock_sock(sk);
1508 inet->tos = ip_hdr(skb)->tos;
1515 sk->sk_priority = skb->priority; 1509 sk->sk_priority = skb->priority;
1516 sk->sk_protocol = ip_hdr(skb)->protocol; 1510 sk->sk_protocol = ip_hdr(skb)->protocol;
1517 sk->sk_bound_dev_if = arg->bound_dev_if; 1511 sk->sk_bound_dev_if = arg->bound_dev_if;
1518 sock_net_set(sk, net);
1519 __skb_queue_head_init(&sk->sk_write_queue);
1520 sk->sk_sndbuf = sysctl_wmem_default;
1521 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1512 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1522 &ipc, &rt, MSG_DONTWAIT); 1513 &ipc, &rt, MSG_DONTWAIT);
1523 nskb = skb_peek(&sk->sk_write_queue); 1514 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1524 if (nskb) {
1525 if (arg->csumoffset >= 0) 1515 if (arg->csumoffset >= 0)
1526 *((__sum16 *)skb_transport_header(nskb) + 1516 *((__sum16 *)skb_transport_header(skb) +
1527 arg->csumoffset) = csum_fold(csum_add(nskb->csum, 1517 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1528 arg->csum)); 1518 arg->csum));
1529 nskb->ip_summed = CHECKSUM_NONE; 1519 skb->ip_summed = CHECKSUM_NONE;
1530 skb_orphan(nskb);
1531 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1532 ip_push_pending_frames(sk, &fl4); 1520 ip_push_pending_frames(sk, &fl4);
1533 } 1521 }
1534 1522
1535 put_cpu_var(unicast_sock); 1523 bh_unlock_sock(sk);
1536 1524
1537 ip_rt_put(rt); 1525 ip_rt_put(rt);
1538} 1526}
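Both __ip_append_data() and ip_append_page() above first try to extend the last page fragment in place when the new data is contiguous with it (skb_can_coalesce(), or the explicit page/offset bookkeeping in the older column), and only fall back to a fresh fragment slot up to MAX_SKB_FRAGS. A toy sketch of that decision with an illustrative fragment array, not the skb machinery itself:

#include <stdio.h>

#define MAX_FRAGS 4

struct frag { int page_id, off, size; };
struct buf  { struct frag frags[MAX_FRAGS]; int nr; };

/* Append (page_id, off, len): grow the last fragment if contiguous on the
 * same page, otherwise take a new slot; fail when the array is full. */
static int append(struct buf *b, int page_id, int off, int len)
{
        if (b->nr > 0) {
                struct frag *last = &b->frags[b->nr - 1];
                if (last->page_id == page_id && last->off + last->size == off) {
                        last->size += len;        /* coalesce with the tail */
                        return 0;
                }
        }
        if (b->nr == MAX_FRAGS)
                return -1;                        /* would be -EMSGSIZE */
        b->frags[b->nr++] = (struct frag){ page_id, off, len };
        return 0;
}

int main(void)
{
        struct buf b = { .nr = 0 };
        append(&b, 1, 0, 100);
        append(&b, 1, 100, 50);                   /* merges into one fragment */
        printf("%d frags, first size %d\n", b.nr, b.frags[0].size);
        return 0;
}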
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d9c4f113d70..8905e92f896 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -33,14 +33,12 @@
33#include <linux/netfilter.h> 33#include <linux/netfilter.h>
34#include <linux/route.h> 34#include <linux/route.h>
35#include <linux/mroute.h> 35#include <linux/mroute.h>
36#include <net/inet_ecn.h>
37#include <net/route.h> 36#include <net/route.h>
38#include <net/xfrm.h> 37#include <net/xfrm.h>
39#include <net/compat.h> 38#include <net/compat.h>
40#if IS_ENABLED(CONFIG_IPV6) 39#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
41#include <net/transp_v6.h> 40#include <net/transp_v6.h>
42#endif 41#endif
43#include <net/ip_fib.h>
44 42
45#include <linux/errqueue.h> 43#include <linux/errqueue.h>
46#include <asm/uaccess.h> 44#include <asm/uaccess.h>
@@ -56,13 +54,20 @@
56/* 54/*
57 * SOL_IP control messages. 55 * SOL_IP control messages.
58 */ 56 */
59#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
60 57
61static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) 58static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
62{ 59{
63 struct in_pktinfo info = *PKTINFO_SKB_CB(skb); 60 struct in_pktinfo info;
61 struct rtable *rt = skb_rtable(skb);
64 62
65 info.ipi_addr.s_addr = ip_hdr(skb)->daddr; 63 info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
64 if (rt) {
65 info.ipi_ifindex = rt->rt_iif;
66 info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
67 } else {
68 info.ipi_ifindex = 0;
69 info.ipi_spec_dst.s_addr = 0;
70 }
66 71
67 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 72 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
68} 73}
@@ -91,7 +96,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
91static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) 96static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
92{ 97{
93 unsigned char optbuf[sizeof(struct ip_options) + 40]; 98 unsigned char optbuf[sizeof(struct ip_options) + 40];
94 struct ip_options *opt = (struct ip_options *)optbuf; 99 struct ip_options * opt = (struct ip_options *)optbuf;
95 100
96 if (IPCB(skb)->opt.optlen == 0) 101 if (IPCB(skb)->opt.optlen == 0)
97 return; 102 return;
@@ -148,7 +153,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
148void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 153void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
149{ 154{
150 struct inet_sock *inet = inet_sk(skb->sk); 155 struct inet_sock *inet = inet_sk(skb->sk);
151 unsigned int flags = inet->cmsg_flags; 156 unsigned flags = inet->cmsg_flags;
152 157
153 /* Ordered by supposed usage frequency */ 158 /* Ordered by supposed usage frequency */
154 if (flags & 1) 159 if (flags & 1)
@@ -446,6 +451,11 @@ out:
446} 451}
447 452
448 453
454static void opt_kfree_rcu(struct rcu_head *head)
455{
456 kfree(container_of(head, struct ip_options_rcu, rcu));
457}
458
449/* 459/*
450 * Socket option code for IP. This is the end of the line after any 460 * Socket option code for IP. This is the end of the line after any
451 * TCP,UDP etc options on an IP socket. 461 * TCP,UDP etc options on an IP socket.
@@ -457,28 +467,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
457 struct inet_sock *inet = inet_sk(sk); 467 struct inet_sock *inet = inet_sk(sk);
458 int val = 0, err; 468 int val = 0, err;
459 469
460 switch (optname) { 470 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
461 case IP_PKTINFO: 471 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
462 case IP_RECVTTL: 472 (1<<IP_RETOPTS) | (1<<IP_TOS) |
463 case IP_RECVOPTS: 473 (1<<IP_TTL) | (1<<IP_HDRINCL) |
464 case IP_RECVTOS: 474 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
465 case IP_RETOPTS: 475 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
466 case IP_TOS: 476 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
467 case IP_TTL: 477 (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
468 case IP_HDRINCL: 478 optname == IP_MULTICAST_TTL ||
469 case IP_MTU_DISCOVER: 479 optname == IP_MULTICAST_ALL ||
470 case IP_RECVERR: 480 optname == IP_MULTICAST_LOOP ||
471 case IP_ROUTER_ALERT: 481 optname == IP_RECVORIGDSTADDR) {
472 case IP_FREEBIND:
473 case IP_PASSSEC:
474 case IP_TRANSPARENT:
475 case IP_MINTTL:
476 case IP_NODEFRAG:
477 case IP_UNICAST_IF:
478 case IP_MULTICAST_TTL:
479 case IP_MULTICAST_ALL:
480 case IP_MULTICAST_LOOP:
481 case IP_RECVORIGDSTADDR:
482 if (optlen >= sizeof(int)) { 482 if (optlen >= sizeof(int)) {
483 if (get_user(val, (int __user *) optval)) 483 if (get_user(val, (int __user *) optval))
484 return -EFAULT; 484 return -EFAULT;
@@ -514,7 +514,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
514 sock_owned_by_user(sk)); 514 sock_owned_by_user(sk));
515 if (inet->is_icsk) { 515 if (inet->is_icsk) {
516 struct inet_connection_sock *icsk = inet_csk(sk); 516 struct inet_connection_sock *icsk = inet_csk(sk);
517#if IS_ENABLED(CONFIG_IPV6) 517#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
518 if (sk->sk_family == PF_INET || 518 if (sk->sk_family == PF_INET ||
519 (!((1 << sk->sk_state) & 519 (!((1 << sk->sk_state) &
520 (TCPF_LISTEN | TCPF_CLOSE)) && 520 (TCPF_LISTEN | TCPF_CLOSE)) &&
@@ -525,13 +525,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
525 if (opt) 525 if (opt)
526 icsk->icsk_ext_hdr_len += opt->opt.optlen; 526 icsk->icsk_ext_hdr_len += opt->opt.optlen;
527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
528#if IS_ENABLED(CONFIG_IPV6) 528#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
529 } 529 }
530#endif 530#endif
531 } 531 }
532 rcu_assign_pointer(inet->inet_opt, opt); 532 rcu_assign_pointer(inet->inet_opt, opt);
533 if (old) 533 if (old)
534 kfree_rcu(old, rcu); 534 call_rcu(&old->rcu, opt_kfree_rcu);
535 break; 535 break;
536 } 536 }
537 case IP_PKTINFO: 537 case IP_PKTINFO:
@@ -578,8 +578,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
578 break; 578 break;
579 case IP_TOS: /* This sets both TOS and Precedence */ 579 case IP_TOS: /* This sets both TOS and Precedence */
580 if (sk->sk_type == SOCK_STREAM) { 580 if (sk->sk_type == SOCK_STREAM) {
581 val &= ~INET_ECN_MASK; 581 val &= ~3;
582 val |= inet->tos & INET_ECN_MASK; 582 val |= inet->tos & 3;
583 } 583 }
584 if (inet->tos != val) { 584 if (inet->tos != val) {
585 inet->tos = val; 585 inet->tos = val;
@@ -590,7 +590,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
590 case IP_TTL: 590 case IP_TTL:
591 if (optlen < 1) 591 if (optlen < 1)
592 goto e_inval; 592 goto e_inval;
593 if (val != -1 && (val < 1 || val > 255)) 593 if (val != -1 && (val < 0 || val > 255))
594 goto e_inval; 594 goto e_inval;
595 inet->uc_ttl = val; 595 inet->uc_ttl = val;
596 break; 596 break;
@@ -634,35 +634,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
634 goto e_inval; 634 goto e_inval;
635 inet->mc_loop = !!val; 635 inet->mc_loop = !!val;
636 break; 636 break;
637 case IP_UNICAST_IF:
638 {
639 struct net_device *dev = NULL;
640 int ifindex;
641
642 if (optlen != sizeof(int))
643 goto e_inval;
644
645 ifindex = (__force int)ntohl((__force __be32)val);
646 if (ifindex == 0) {
647 inet->uc_index = 0;
648 err = 0;
649 break;
650 }
651
652 dev = dev_get_by_index(sock_net(sk), ifindex);
653 err = -EADDRNOTAVAIL;
654 if (!dev)
655 break;
656 dev_put(dev);
657
658 err = -EINVAL;
659 if (sk->sk_bound_dev_if)
660 break;
661
662 inet->uc_index = ifindex;
663 err = 0;
664 break;
665 }
666 case IP_MULTICAST_IF: 637 case IP_MULTICAST_IF:
667 { 638 {
668 struct ip_mreqn mreq; 639 struct ip_mreqn mreq;
@@ -683,15 +654,10 @@ static int do_ip_setsockopt(struct sock *sk, int level,
683 break; 654 break;
684 } else { 655 } else {
685 memset(&mreq, 0, sizeof(mreq)); 656 memset(&mreq, 0, sizeof(mreq));
686 if (optlen >= sizeof(struct ip_mreq)) { 657 if (optlen >= sizeof(struct in_addr) &&
687 if (copy_from_user(&mreq, optval, 658 copy_from_user(&mreq.imr_address, optval,
688 sizeof(struct ip_mreq))) 659 sizeof(struct in_addr)))
689 break; 660 break;
690 } else if (optlen >= sizeof(struct in_addr)) {
691 if (copy_from_user(&mreq.imr_address, optval,
692 sizeof(struct in_addr)))
693 break;
694 }
695 } 661 }
696 662
697 if (!mreq.imr_ifindex) { 663 if (!mreq.imr_ifindex) {
@@ -989,14 +955,13 @@ mc_msf_out:
989 case IP_IPSEC_POLICY: 955 case IP_IPSEC_POLICY:
990 case IP_XFRM_POLICY: 956 case IP_XFRM_POLICY:
991 err = -EPERM; 957 err = -EPERM;
992 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 958 if (!capable(CAP_NET_ADMIN))
993 break; 959 break;
994 err = xfrm_user_policy(sk, optname, optval, optlen); 960 err = xfrm_user_policy(sk, optname, optval, optlen);
995 break; 961 break;
996 962
997 case IP_TRANSPARENT: 963 case IP_TRANSPARENT:
998 if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 964 if (!capable(CAP_NET_ADMIN)) {
999 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1000 err = -EPERM; 965 err = -EPERM;
1001 break; 966 break;
1002 } 967 }
@@ -1026,27 +991,20 @@ e_inval:
1026} 991}
1027 992
1028/** 993/**
1029 * ipv4_pktinfo_prepare - transfer some info from rtable to skb 994 * ip_queue_rcv_skb - Queue an skb into sock receive queue
1030 * @sk: socket 995 * @sk: socket
1031 * @skb: buffer 996 * @skb: buffer
1032 * 997 *
1033 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific 998 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
1034 * destination in skb->cb[] before dst drop. 999 * is not set, we drop skb dst entry now, while dst cache line is hot.
1035 * This way, the receiver doesn't take cache line misses to read the rtable.
1036 */ 1000 */
1037void ipv4_pktinfo_prepare(struct sk_buff *skb) 1001int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1038{ 1002{
1039 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); 1003 if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
1040 1004 skb_dst_drop(skb);
1041 if (skb_rtable(skb)) { 1005 return sock_queue_rcv_skb(sk, skb);
1042 pktinfo->ipi_ifindex = inet_iif(skb);
1043 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1044 } else {
1045 pktinfo->ipi_ifindex = 0;
1046 pktinfo->ipi_spec_dst.s_addr = 0;
1047 }
1048 skb_dst_drop(skb);
1049} 1006}
1007EXPORT_SYMBOL(ip_queue_rcv_skb);
1050 1008
1051int ip_setsockopt(struct sock *sk, int level, 1009int ip_setsockopt(struct sock *sk, int level,
1052 int optname, char __user *optval, unsigned int optlen) 1010 int optname, char __user *optval, unsigned int optlen)
@@ -1109,7 +1067,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
1109 */ 1067 */
1110 1068
1111static int do_ip_getsockopt(struct sock *sk, int level, int optname, 1069static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1112 char __user *optval, int __user *optlen, unsigned int flags) 1070 char __user *optval, int __user *optlen, unsigned flags)
1113{ 1071{
1114 struct inet_sock *inet = inet_sk(sk); 1072 struct inet_sock *inet = inet_sk(sk);
1115 int val; 1073 int val;
@@ -1218,9 +1176,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1218 case IP_MULTICAST_LOOP: 1176 case IP_MULTICAST_LOOP:
1219 val = inet->mc_loop; 1177 val = inet->mc_loop;
1220 break; 1178 break;
1221 case IP_UNICAST_IF:
1222 val = (__force int)htonl((__u32) inet->uc_index);
1223 break;
1224 case IP_MULTICAST_IF: 1179 case IP_MULTICAST_IF:
1225 { 1180 {
1226 struct in_addr addr; 1181 struct in_addr addr;
@@ -1299,10 +1254,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1299 int hlim = inet->mc_ttl; 1254 int hlim = inet->mc_ttl;
1300 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); 1255 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
1301 } 1256 }
1302 if (inet->cmsg_flags & IP_CMSG_TOS) {
1303 int tos = inet->rcv_tos;
1304 put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
1305 }
1306 len -= msg.msg_controllen; 1257 len -= msg.msg_controllen;
1307 return put_user(len, optlen); 1258 return put_user(len, optlen);
1308 } 1259 }
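
The ip_sockglue.c hunks above revert the newer IP_PKTINFO plumbing (ipv4_pktinfo_prepare() and the per-option switch) to the older ip_queue_rcv_skb() path, but the user-visible contract of IP_PKTINFO is the same either way: once the option is set, ip_cmsg_recv_pktinfo() attaches a struct in_pktinfo control message to every received datagram. The user-space sketch below is only meant to show that contract; it is not part of the patch, and the port number and buffer sizes are arbitrary.

#define _GNU_SOURCE                     /* for struct in_pktinfo in <netinet/in.h> */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    int on = 1;
    struct sockaddr_in sin = { .sin_family = AF_INET,
                               .sin_port = htons(5000) };      /* arbitrary port */
    char data[2048], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
    struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                          .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
    struct cmsghdr *cmsg;

    if (fd < 0)
        return 1;

    /* Ask the kernel to attach IP_PKTINFO ancillary data to each datagram. */
    setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
    bind(fd, (struct sockaddr *)&sin, sizeof(sin));

    if (recvmsg(fd, &msg, 0) < 0)
        return 1;

    /* Walk the control messages filled in by ip_cmsg_recv_pktinfo(). */
    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
            struct in_pktinfo info;

            memcpy(&info, CMSG_DATA(cmsg), sizeof(info));
            printf("ifindex=%d dst=%s\n",
                   info.ipi_ifindex, inet_ntoa(info.ipi_addr));
        }
    }
    close(fd);
    return 0;
}
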
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
deleted file mode 100644
index c3a4233c0ac..00000000000
--- a/net/ipv4/ip_vti.c
+++ /dev/null
@@ -1,942 +0,0 @@
1/*
2 * Linux NET3: IP/IP protocol decoder modified to support
3 * virtual tunnel interface
4 *
5 * Authors:
6 * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 */
14
15/*
16 This version of net/ipv4/ip_vti.c is cloned from net/ipv4/ipip.c
17
18 For comments look at net/ipv4/ip_gre.c --ANK
19 */
20
21
22#include <linux/capability.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/netfilter_ipv4.h>
36#include <linux/if_ether.h>
37
38#include <net/sock.h>
39#include <net/ip.h>
40#include <net/icmp.h>
41#include <net/ipip.h>
42#include <net/inet_ecn.h>
43#include <net/xfrm.h>
44#include <net/net_namespace.h>
45#include <net/netns/generic.h>
46
47#define HASH_SIZE 16
48#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
49
50static struct rtnl_link_ops vti_link_ops __read_mostly;
51
52static int vti_net_id __read_mostly;
53struct vti_net {
54 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
55 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
56 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
57 struct ip_tunnel __rcu *tunnels_wc[1];
58 struct ip_tunnel __rcu **tunnels[4];
59
60 struct net_device *fb_tunnel_dev;
61};
62
63static int vti_fb_tunnel_init(struct net_device *dev);
64static int vti_tunnel_init(struct net_device *dev);
65static void vti_tunnel_setup(struct net_device *dev);
66static void vti_dev_free(struct net_device *dev);
67static int vti_tunnel_bind_dev(struct net_device *dev);
68
69#define VTI_XMIT(stats1, stats2) do { \
70 int err; \
71 int pkt_len = skb->len; \
72 err = dst_output(skb); \
73 if (net_xmit_eval(err) == 0) { \
74 u64_stats_update_begin(&(stats1)->syncp); \
75 (stats1)->tx_bytes += pkt_len; \
76 (stats1)->tx_packets++; \
77 u64_stats_update_end(&(stats1)->syncp); \
78 } else { \
79 (stats2)->tx_errors++; \
80 (stats2)->tx_aborted_errors++; \
81 } \
82} while (0)
83
84
85static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
86 struct rtnl_link_stats64 *tot)
87{
88 int i;
89
90 for_each_possible_cpu(i) {
91 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
92 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
93 unsigned int start;
94
95 do {
96 start = u64_stats_fetch_begin_bh(&tstats->syncp);
97 rx_packets = tstats->rx_packets;
98 tx_packets = tstats->tx_packets;
99 rx_bytes = tstats->rx_bytes;
100 tx_bytes = tstats->tx_bytes;
101 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
102
103 tot->rx_packets += rx_packets;
104 tot->tx_packets += tx_packets;
105 tot->rx_bytes += rx_bytes;
106 tot->tx_bytes += tx_bytes;
107 }
108
109 tot->multicast = dev->stats.multicast;
110 tot->rx_crc_errors = dev->stats.rx_crc_errors;
111 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
112 tot->rx_length_errors = dev->stats.rx_length_errors;
113 tot->rx_errors = dev->stats.rx_errors;
114 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
115 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
116 tot->tx_dropped = dev->stats.tx_dropped;
117 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
118 tot->tx_errors = dev->stats.tx_errors;
119
120 return tot;
121}
122
123static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
124 __be32 remote, __be32 local)
125{
126 unsigned h0 = HASH(remote);
127 unsigned h1 = HASH(local);
128 struct ip_tunnel *t;
129 struct vti_net *ipn = net_generic(net, vti_net_id);
130
131 for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
132 if (local == t->parms.iph.saddr &&
133 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
134 return t;
135 for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
136 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
137 return t;
138
139 for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
140 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
141 return t;
142
143 for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
144 if (t && (t->dev->flags&IFF_UP))
145 return t;
146 return NULL;
147}
148
149static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
150 struct ip_tunnel_parm *parms)
151{
152 __be32 remote = parms->iph.daddr;
153 __be32 local = parms->iph.saddr;
154 unsigned h = 0;
155 int prio = 0;
156
157 if (remote) {
158 prio |= 2;
159 h ^= HASH(remote);
160 }
161 if (local) {
162 prio |= 1;
163 h ^= HASH(local);
164 }
165 return &ipn->tunnels[prio][h];
166}
167
168static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
169 struct ip_tunnel *t)
170{
171 return __vti_bucket(ipn, &t->parms);
172}
173
174static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
175{
176 struct ip_tunnel __rcu **tp;
177 struct ip_tunnel *iter;
178
179 for (tp = vti_bucket(ipn, t);
180 (iter = rtnl_dereference(*tp)) != NULL;
181 tp = &iter->next) {
182 if (t == iter) {
183 rcu_assign_pointer(*tp, t->next);
184 break;
185 }
186 }
187}
188
189static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
190{
191 struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
192
193 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
194 rcu_assign_pointer(*tp, t);
195}
196
197static struct ip_tunnel *vti_tunnel_locate(struct net *net,
198 struct ip_tunnel_parm *parms,
199 int create)
200{
201 __be32 remote = parms->iph.daddr;
202 __be32 local = parms->iph.saddr;
203 struct ip_tunnel *t, *nt;
204 struct ip_tunnel __rcu **tp;
205 struct net_device *dev;
206 char name[IFNAMSIZ];
207 struct vti_net *ipn = net_generic(net, vti_net_id);
208
209 for (tp = __vti_bucket(ipn, parms);
210 (t = rtnl_dereference(*tp)) != NULL;
211 tp = &t->next) {
212 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
213 return t;
214 }
215 if (!create)
216 return NULL;
217
218 if (parms->name[0])
219 strlcpy(name, parms->name, IFNAMSIZ);
220 else
221 strcpy(name, "vti%d");
222
223 dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
224 if (dev == NULL)
225 return NULL;
226
227 dev_net_set(dev, net);
228
229 nt = netdev_priv(dev);
230 nt->parms = *parms;
231 dev->rtnl_link_ops = &vti_link_ops;
232
233 vti_tunnel_bind_dev(dev);
234
235 if (register_netdevice(dev) < 0)
236 goto failed_free;
237
238 dev_hold(dev);
239 vti_tunnel_link(ipn, nt);
240 return nt;
241
242failed_free:
243 free_netdev(dev);
244 return NULL;
245}
246
247static void vti_tunnel_uninit(struct net_device *dev)
248{
249 struct net *net = dev_net(dev);
250 struct vti_net *ipn = net_generic(net, vti_net_id);
251
252 vti_tunnel_unlink(ipn, netdev_priv(dev));
253 dev_put(dev);
254}
255
256static int vti_err(struct sk_buff *skb, u32 info)
257{
258
259 /* All the routers (except for Linux) return only
260 * 8 bytes of packet payload. This means that precise relaying of
261 * ICMP in the real Internet is absolutely infeasible.
262 */
263 struct iphdr *iph = (struct iphdr *)skb->data;
264 const int type = icmp_hdr(skb)->type;
265 const int code = icmp_hdr(skb)->code;
266 struct ip_tunnel *t;
267 int err;
268
269 switch (type) {
270 default:
271 case ICMP_PARAMETERPROB:
272 return 0;
273
274 case ICMP_DEST_UNREACH:
275 switch (code) {
276 case ICMP_SR_FAILED:
277 case ICMP_PORT_UNREACH:
278 /* Impossible event. */
279 return 0;
280 default:
281 /* All others are translated to HOST_UNREACH. */
282 break;
283 }
284 break;
285 case ICMP_TIME_EXCEEDED:
286 if (code != ICMP_EXC_TTL)
287 return 0;
288 break;
289 }
290
291 err = -ENOENT;
292
293 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
294 if (t == NULL)
295 goto out;
296
297 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
298 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
299 t->parms.link, 0, IPPROTO_IPIP, 0);
300 err = 0;
301 goto out;
302 }
303
304 err = 0;
305 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
306 goto out;
307
308 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
309 t->err_count++;
310 else
311 t->err_count = 1;
312 t->err_time = jiffies;
313out:
314 return err;
315}
316
317/* We don't digest the packet, therefore let the packet pass */
318static int vti_rcv(struct sk_buff *skb)
319{
320 struct ip_tunnel *tunnel;
321 const struct iphdr *iph = ip_hdr(skb);
322
323 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
324 if (tunnel != NULL) {
325 struct pcpu_tstats *tstats;
326
327 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
328 return -1;
329
330 tstats = this_cpu_ptr(tunnel->dev->tstats);
331 u64_stats_update_begin(&tstats->syncp);
332 tstats->rx_packets++;
333 tstats->rx_bytes += skb->len;
334 u64_stats_update_end(&tstats->syncp);
335
336 skb->mark = 0;
337 secpath_reset(skb);
338 skb->dev = tunnel->dev;
339 return 1;
340 }
341
342 return -1;
343}
344
345/* This function assumes it is being called from dev_queue_xmit()
346 * and that skb is filled properly by that function.
347 */
348
349static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
350{
351 struct ip_tunnel *tunnel = netdev_priv(dev);
352 struct pcpu_tstats *tstats;
353 struct iphdr *tiph = &tunnel->parms.iph;
354 u8 tos;
355 struct rtable *rt; /* Route to the other host */
356 struct net_device *tdev; /* Device to other host */
357 struct iphdr *old_iph = ip_hdr(skb);
358 __be32 dst = tiph->daddr;
359 struct flowi4 fl4;
360
361 if (skb->protocol != htons(ETH_P_IP))
362 goto tx_error;
363
364 tos = old_iph->tos;
365
366 memset(&fl4, 0, sizeof(fl4));
367 flowi4_init_output(&fl4, tunnel->parms.link,
368 be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
369 RT_SCOPE_UNIVERSE,
370 IPPROTO_IPIP, 0,
371 dst, tiph->saddr, 0, 0);
372 rt = ip_route_output_key(dev_net(dev), &fl4);
373 if (IS_ERR(rt)) {
374 dev->stats.tx_carrier_errors++;
375 goto tx_error_icmp;
376 }
377 /* if there is no transform then this tunnel is not functional.
378 * Or if the xfrm is not mode tunnel.
379 */
380 if (!rt->dst.xfrm ||
381 rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
382 dev->stats.tx_carrier_errors++;
383 goto tx_error_icmp;
384 }
385 tdev = rt->dst.dev;
386
387 if (tdev == dev) {
388 ip_rt_put(rt);
389 dev->stats.collisions++;
390 goto tx_error;
391 }
392
393 if (tunnel->err_count > 0) {
394 if (time_before(jiffies,
395 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
396 tunnel->err_count--;
397 dst_link_failure(skb);
398 } else
399 tunnel->err_count = 0;
400 }
401
402 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
403 IPSKB_REROUTED);
404 skb_dst_drop(skb);
405 skb_dst_set(skb, &rt->dst);
406 nf_reset(skb);
407 skb->dev = skb_dst(skb)->dev;
408
409 tstats = this_cpu_ptr(dev->tstats);
410 VTI_XMIT(tstats, &dev->stats);
411 return NETDEV_TX_OK;
412
413tx_error_icmp:
414 dst_link_failure(skb);
415tx_error:
416 dev->stats.tx_errors++;
417 dev_kfree_skb(skb);
418 return NETDEV_TX_OK;
419}
420
421static int vti_tunnel_bind_dev(struct net_device *dev)
422{
423 struct net_device *tdev = NULL;
424 struct ip_tunnel *tunnel;
425 struct iphdr *iph;
426
427 tunnel = netdev_priv(dev);
428 iph = &tunnel->parms.iph;
429
430 if (iph->daddr) {
431 struct rtable *rt;
432 struct flowi4 fl4;
433 memset(&fl4, 0, sizeof(fl4));
434 flowi4_init_output(&fl4, tunnel->parms.link,
435 be32_to_cpu(tunnel->parms.i_key),
436 RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
437 IPPROTO_IPIP, 0,
438 iph->daddr, iph->saddr, 0, 0);
439 rt = ip_route_output_key(dev_net(dev), &fl4);
440 if (!IS_ERR(rt)) {
441 tdev = rt->dst.dev;
442 ip_rt_put(rt);
443 }
444 dev->flags |= IFF_POINTOPOINT;
445 }
446
447 if (!tdev && tunnel->parms.link)
448 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
449
450 if (tdev) {
451 dev->hard_header_len = tdev->hard_header_len +
452 sizeof(struct iphdr);
453 dev->mtu = tdev->mtu;
454 }
455 dev->iflink = tunnel->parms.link;
456 return dev->mtu;
457}
458
459static int
460vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
461{
462 int err = 0;
463 struct ip_tunnel_parm p;
464 struct ip_tunnel *t;
465 struct net *net = dev_net(dev);
466 struct vti_net *ipn = net_generic(net, vti_net_id);
467
468 switch (cmd) {
469 case SIOCGETTUNNEL:
470 t = NULL;
471 if (dev == ipn->fb_tunnel_dev) {
472 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
473 sizeof(p))) {
474 err = -EFAULT;
475 break;
476 }
477 t = vti_tunnel_locate(net, &p, 0);
478 }
479 if (t == NULL)
480 t = netdev_priv(dev);
481 memcpy(&p, &t->parms, sizeof(p));
482 p.i_flags |= GRE_KEY | VTI_ISVTI;
483 p.o_flags |= GRE_KEY;
484 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
485 err = -EFAULT;
486 break;
487
488 case SIOCADDTUNNEL:
489 case SIOCCHGTUNNEL:
490 err = -EPERM;
491 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
492 goto done;
493
494 err = -EFAULT;
495 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
496 goto done;
497
498 err = -EINVAL;
499 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
500 p.iph.ihl != 5)
501 goto done;
502
503 t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
504
505 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
506 if (t != NULL) {
507 if (t->dev != dev) {
508 err = -EEXIST;
509 break;
510 }
511 } else {
512 if (((dev->flags&IFF_POINTOPOINT) &&
513 !p.iph.daddr) ||
514 (!(dev->flags&IFF_POINTOPOINT) &&
515 p.iph.daddr)) {
516 err = -EINVAL;
517 break;
518 }
519 t = netdev_priv(dev);
520 vti_tunnel_unlink(ipn, t);
521 synchronize_net();
522 t->parms.iph.saddr = p.iph.saddr;
523 t->parms.iph.daddr = p.iph.daddr;
524 t->parms.i_key = p.i_key;
525 t->parms.o_key = p.o_key;
526 t->parms.iph.protocol = IPPROTO_IPIP;
527 memcpy(dev->dev_addr, &p.iph.saddr, 4);
528 memcpy(dev->broadcast, &p.iph.daddr, 4);
529 vti_tunnel_link(ipn, t);
530 netdev_state_change(dev);
531 }
532 }
533
534 if (t) {
535 err = 0;
536 if (cmd == SIOCCHGTUNNEL) {
537 t->parms.i_key = p.i_key;
538 t->parms.o_key = p.o_key;
539 if (t->parms.link != p.link) {
540 t->parms.link = p.link;
541 vti_tunnel_bind_dev(dev);
542 netdev_state_change(dev);
543 }
544 }
545 p.i_flags |= GRE_KEY | VTI_ISVTI;
546 p.o_flags |= GRE_KEY;
547 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
548 sizeof(p)))
549 err = -EFAULT;
550 } else
551 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
552 break;
553
554 case SIOCDELTUNNEL:
555 err = -EPERM;
556 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
557 goto done;
558
559 if (dev == ipn->fb_tunnel_dev) {
560 err = -EFAULT;
561 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
562 sizeof(p)))
563 goto done;
564 err = -ENOENT;
565
566 t = vti_tunnel_locate(net, &p, 0);
567 if (t == NULL)
568 goto done;
569 err = -EPERM;
570 if (t->dev == ipn->fb_tunnel_dev)
571 goto done;
572 dev = t->dev;
573 }
574 unregister_netdevice(dev);
575 err = 0;
576 break;
577
578 default:
579 err = -EINVAL;
580 }
581
582done:
583 return err;
584}
585
586static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
587{
588 if (new_mtu < 68 || new_mtu > 0xFFF8)
589 return -EINVAL;
590 dev->mtu = new_mtu;
591 return 0;
592}
593
594static const struct net_device_ops vti_netdev_ops = {
595 .ndo_init = vti_tunnel_init,
596 .ndo_uninit = vti_tunnel_uninit,
597 .ndo_start_xmit = vti_tunnel_xmit,
598 .ndo_do_ioctl = vti_tunnel_ioctl,
599 .ndo_change_mtu = vti_tunnel_change_mtu,
600 .ndo_get_stats64 = vti_get_stats64,
601};
602
603static void vti_dev_free(struct net_device *dev)
604{
605 free_percpu(dev->tstats);
606 free_netdev(dev);
607}
608
609static void vti_tunnel_setup(struct net_device *dev)
610{
611 dev->netdev_ops = &vti_netdev_ops;
612 dev->destructor = vti_dev_free;
613
614 dev->type = ARPHRD_TUNNEL;
615 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
616 dev->mtu = ETH_DATA_LEN;
617 dev->flags = IFF_NOARP;
618 dev->iflink = 0;
619 dev->addr_len = 4;
620 dev->features |= NETIF_F_NETNS_LOCAL;
621 dev->features |= NETIF_F_LLTX;
622 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
623}
624
625static int vti_tunnel_init(struct net_device *dev)
626{
627 struct ip_tunnel *tunnel = netdev_priv(dev);
628
629 tunnel->dev = dev;
630 strcpy(tunnel->parms.name, dev->name);
631
632 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
633 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
634
635 dev->tstats = alloc_percpu(struct pcpu_tstats);
636 if (!dev->tstats)
637 return -ENOMEM;
638
639 return 0;
640}
641
642static int __net_init vti_fb_tunnel_init(struct net_device *dev)
643{
644 struct ip_tunnel *tunnel = netdev_priv(dev);
645 struct iphdr *iph = &tunnel->parms.iph;
646 struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
647
648 tunnel->dev = dev;
649 strcpy(tunnel->parms.name, dev->name);
650
651 iph->version = 4;
652 iph->protocol = IPPROTO_IPIP;
653 iph->ihl = 5;
654
655 dev->tstats = alloc_percpu(struct pcpu_tstats);
656 if (!dev->tstats)
657 return -ENOMEM;
658
659 dev_hold(dev);
660 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
661 return 0;
662}
663
664static struct xfrm_tunnel vti_handler __read_mostly = {
665 .handler = vti_rcv,
666 .err_handler = vti_err,
667 .priority = 1,
668};
669
670static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
671{
672 int prio;
673
674 for (prio = 1; prio < 4; prio++) {
675 int h;
676 for (h = 0; h < HASH_SIZE; h++) {
677 struct ip_tunnel *t;
678
679 t = rtnl_dereference(ipn->tunnels[prio][h]);
680 while (t != NULL) {
681 unregister_netdevice_queue(t->dev, head);
682 t = rtnl_dereference(t->next);
683 }
684 }
685 }
686}
687
688static int __net_init vti_init_net(struct net *net)
689{
690 int err;
691 struct vti_net *ipn = net_generic(net, vti_net_id);
692
693 ipn->tunnels[0] = ipn->tunnels_wc;
694 ipn->tunnels[1] = ipn->tunnels_l;
695 ipn->tunnels[2] = ipn->tunnels_r;
696 ipn->tunnels[3] = ipn->tunnels_r_l;
697
698 ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
699 "ip_vti0",
700 vti_tunnel_setup);
701 if (!ipn->fb_tunnel_dev) {
702 err = -ENOMEM;
703 goto err_alloc_dev;
704 }
705 dev_net_set(ipn->fb_tunnel_dev, net);
706
707 err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
708 if (err)
709 goto err_reg_dev;
710 ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
711
712 err = register_netdev(ipn->fb_tunnel_dev);
713 if (err)
714 goto err_reg_dev;
715 return 0;
716
717err_reg_dev:
718 vti_dev_free(ipn->fb_tunnel_dev);
719err_alloc_dev:
720 /* nothing */
721 return err;
722}
723
724static void __net_exit vti_exit_net(struct net *net)
725{
726 struct vti_net *ipn = net_generic(net, vti_net_id);
727 LIST_HEAD(list);
728
729 rtnl_lock();
730 vti_destroy_tunnels(ipn, &list);
731 unregister_netdevice_many(&list);
732 rtnl_unlock();
733}
734
735static struct pernet_operations vti_net_ops = {
736 .init = vti_init_net,
737 .exit = vti_exit_net,
738 .id = &vti_net_id,
739 .size = sizeof(struct vti_net),
740};
741
742static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
743{
744 return 0;
745}
746
747static void vti_netlink_parms(struct nlattr *data[],
748 struct ip_tunnel_parm *parms)
749{
750 memset(parms, 0, sizeof(*parms));
751
752 parms->iph.protocol = IPPROTO_IPIP;
753
754 if (!data)
755 return;
756
757 if (data[IFLA_VTI_LINK])
758 parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
759
760 if (data[IFLA_VTI_IKEY])
761 parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
762
763 if (data[IFLA_VTI_OKEY])
764 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
765
766 if (data[IFLA_VTI_LOCAL])
767 parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
768
769 if (data[IFLA_VTI_REMOTE])
770 parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
771
772}
773
774static int vti_newlink(struct net *src_net, struct net_device *dev,
775 struct nlattr *tb[], struct nlattr *data[])
776{
777 struct ip_tunnel *nt;
778 struct net *net = dev_net(dev);
779 struct vti_net *ipn = net_generic(net, vti_net_id);
780 int mtu;
781 int err;
782
783 nt = netdev_priv(dev);
784 vti_netlink_parms(data, &nt->parms);
785
786 if (vti_tunnel_locate(net, &nt->parms, 0))
787 return -EEXIST;
788
789 mtu = vti_tunnel_bind_dev(dev);
790 if (!tb[IFLA_MTU])
791 dev->mtu = mtu;
792
793 err = register_netdevice(dev);
794 if (err)
795 goto out;
796
797 dev_hold(dev);
798 vti_tunnel_link(ipn, nt);
799
800out:
801 return err;
802}
803
804static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
805 struct nlattr *data[])
806{
807 struct ip_tunnel *t, *nt;
808 struct net *net = dev_net(dev);
809 struct vti_net *ipn = net_generic(net, vti_net_id);
810 struct ip_tunnel_parm p;
811 int mtu;
812
813 if (dev == ipn->fb_tunnel_dev)
814 return -EINVAL;
815
816 nt = netdev_priv(dev);
817 vti_netlink_parms(data, &p);
818
819 t = vti_tunnel_locate(net, &p, 0);
820
821 if (t) {
822 if (t->dev != dev)
823 return -EEXIST;
824 } else {
825 t = nt;
826
827 vti_tunnel_unlink(ipn, t);
828 t->parms.iph.saddr = p.iph.saddr;
829 t->parms.iph.daddr = p.iph.daddr;
830 t->parms.i_key = p.i_key;
831 t->parms.o_key = p.o_key;
832 if (dev->type != ARPHRD_ETHER) {
833 memcpy(dev->dev_addr, &p.iph.saddr, 4);
834 memcpy(dev->broadcast, &p.iph.daddr, 4);
835 }
836 vti_tunnel_link(ipn, t);
837 netdev_state_change(dev);
838 }
839
840 if (t->parms.link != p.link) {
841 t->parms.link = p.link;
842 mtu = vti_tunnel_bind_dev(dev);
843 if (!tb[IFLA_MTU])
844 dev->mtu = mtu;
845 netdev_state_change(dev);
846 }
847
848 return 0;
849}
850
851static size_t vti_get_size(const struct net_device *dev)
852{
853 return
854 /* IFLA_VTI_LINK */
855 nla_total_size(4) +
856 /* IFLA_VTI_IKEY */
857 nla_total_size(4) +
858 /* IFLA_VTI_OKEY */
859 nla_total_size(4) +
860 /* IFLA_VTI_LOCAL */
861 nla_total_size(4) +
862 /* IFLA_VTI_REMOTE */
863 nla_total_size(4) +
864 0;
865}
866
867static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
868{
869 struct ip_tunnel *t = netdev_priv(dev);
870 struct ip_tunnel_parm *p = &t->parms;
871
872 nla_put_u32(skb, IFLA_VTI_LINK, p->link);
873 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
874 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
875 nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
876 nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
877
878 return 0;
879}
880
881static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
882 [IFLA_VTI_LINK] = { .type = NLA_U32 },
883 [IFLA_VTI_IKEY] = { .type = NLA_U32 },
884 [IFLA_VTI_OKEY] = { .type = NLA_U32 },
885 [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
886 [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
887};
888
889static struct rtnl_link_ops vti_link_ops __read_mostly = {
890 .kind = "vti",
891 .maxtype = IFLA_VTI_MAX,
892 .policy = vti_policy,
893 .priv_size = sizeof(struct ip_tunnel),
894 .setup = vti_tunnel_setup,
895 .validate = vti_tunnel_validate,
896 .newlink = vti_newlink,
897 .changelink = vti_changelink,
898 .get_size = vti_get_size,
899 .fill_info = vti_fill_info,
900};
901
902static int __init vti_init(void)
903{
904 int err;
905
906 pr_info("IPv4 over IPSec tunneling driver\n");
907
908 err = register_pernet_device(&vti_net_ops);
909 if (err < 0)
910 return err;
911 err = xfrm4_mode_tunnel_input_register(&vti_handler);
912 if (err < 0) {
913 unregister_pernet_device(&vti_net_ops);
914 pr_info("vti init: can't register tunnel\n");
915 }
916
917 err = rtnl_link_register(&vti_link_ops);
918 if (err < 0)
919 goto rtnl_link_failed;
920
921 return err;
922
923rtnl_link_failed:
924 xfrm4_mode_tunnel_input_deregister(&vti_handler);
925 unregister_pernet_device(&vti_net_ops);
926 return err;
927}
928
929static void __exit vti_fini(void)
930{
931 rtnl_link_unregister(&vti_link_ops);
932 if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
933 pr_info("vti close: can't deregister tunnel\n");
934
935 unregister_pernet_device(&vti_net_ops);
936}
937
938module_init(vti_init);
939module_exit(vti_fini);
940MODULE_LICENSE("GPL");
941MODULE_ALIAS_RTNL_LINK("vti");
942MODULE_ALIAS_NETDEV("ip_vti0");
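
For reference, the deleted ip_vti.c keeps its tunnels in four hash tables selected by which of the local/remote endpoints are set (see __vti_bucket() and vti_tunnel_lookup() above, and the tunnels[] setup in vti_init_net()). The stand-alone sketch below reproduces just that bucket selection so the scheme is easy to see in isolation; it is illustrative only, and the sample addresses are arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ (((uint32_t)(addr)) >> 4)) & (HASH_SIZE - 1))

/* Mirror of __vti_bucket(): bit 1 of prio = remote set, bit 0 = local set,
 * so prio 0..3 maps to tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l. */
static void pick_bucket(uint32_t remote, uint32_t local)
{
    unsigned int h = 0;
    int prio = 0;

    if (remote) {
        prio |= 2;
        h ^= HASH(remote);
    }
    if (local) {
        prio |= 1;
        h ^= HASH(local);
    }
    printf("table %d, bucket %u\n", prio, h);
}

int main(void)
{
    pick_bucket(inet_addr("192.0.2.1"), inet_addr("198.51.100.2"));
    pick_bucket(0, inet_addr("198.51.100.2"));
    pick_bucket(0, 0);
    return 0;
}
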
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d3ab47e19a8..c857f6f49b0 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,26 +31,17 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
34 switch (icmp_hdr(skb)->type) { 34 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
35 case ICMP_DEST_UNREACH: 35 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
36 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
37 return;
38 case ICMP_REDIRECT:
39 break;
40 default:
41 return; 36 return;
42 }
43 37
44 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
45 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
46 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
47 if (!x) 41 if (!x)
48 return; 42 return;
49 43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 44 spi, &iph->daddr);
51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
52 else
53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
54 xfrm_state_put(x); 45 xfrm_state_put(x);
55} 46}
56 47
@@ -165,11 +156,11 @@ static const struct net_protocol ipcomp4_protocol = {
165static int __init ipcomp4_init(void) 156static int __init ipcomp4_init(void)
166{ 157{
167 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { 158 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
168 pr_info("%s: can't add xfrm type\n", __func__); 159 printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
169 return -EAGAIN; 160 return -EAGAIN;
170 } 161 }
171 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { 162 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
172 pr_info("%s: can't add protocol\n", __func__); 163 printk(KERN_INFO "ipcomp init: can't add protocol\n");
173 xfrm_unregister_type(&ipcomp_type, AF_INET); 164 xfrm_unregister_type(&ipcomp_type, AF_INET);
174 return -EAGAIN; 165 return -EAGAIN;
175 } 166 }
@@ -179,9 +170,9 @@ static int __init ipcomp4_init(void)
179static void __exit ipcomp4_fini(void) 170static void __exit ipcomp4_fini(void)
180{ 171{
181 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) 172 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
182 pr_info("%s: can't remove protocol\n", __func__); 173 printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
183 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) 174 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
184 pr_info("%s: can't remove xfrm type\n", __func__); 175 printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
185} 176}
186 177
187module_init(ipcomp4_init); 178module_init(ipcomp4_init);
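
Both the old and new versions of ipcomp4_err() above derive the xfrm lookup key the same way: the 16-bit IPComp CPI taken from the packet is widened into a 32-bit SPI while staying in network byte order. A tiny stand-alone sketch of that construction (the CPI value is arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
    uint16_t cpi = htons(0x1234);       /* CPI as carried in the IPComp header */
    uint32_t spi = htonl(ntohs(cpi));   /* same expression as in ipcomp4_err() */

    printf("cpi=0x%04x spi=0x%08x\n", ntohs(cpi), ntohl(spi));
    return 0;
}
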
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index a2e50ae80b5..004bb74b41c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,7 +54,6 @@
54#include <linux/delay.h> 54#include <linux/delay.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include <linux/slab.h> 56#include <linux/slab.h>
57#include <linux/export.h>
58#include <net/net_namespace.h> 57#include <net/net_namespace.h>
59#include <net/arp.h> 58#include <net/arp.h>
60#include <net/ip.h> 59#include <net/ip.h>
@@ -136,14 +135,12 @@ __be32 ic_myaddr = NONE; /* My IP address */
136static __be32 ic_netmask = NONE; /* Netmask for local subnet */ 135static __be32 ic_netmask = NONE; /* Netmask for local subnet */
137__be32 ic_gateway = NONE; /* Gateway IP address */ 136__be32 ic_gateway = NONE; /* Gateway IP address */
138 137
139__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
140
141__be32 ic_servaddr = NONE; /* Boot server IP address */ 138__be32 ic_servaddr = NONE; /* Boot server IP address */
142 139
143__be32 root_server_addr = NONE; /* Address of NFS server */ 140__be32 root_server_addr = NONE; /* Address of NFS server */
144u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 141u8 root_server_path[256] = { 0, }; /* Path to mount as root */
145 142
146__be32 ic_dev_xid; /* Device under configuration */ 143u32 ic_dev_xid; /* Device under configuration */
147 144
148/* vendor class identifier */ 145/* vendor class identifier */
149static char vendor_class_identifier[253] __initdata; 146static char vendor_class_identifier[253] __initdata;
@@ -216,7 +213,7 @@ static int __init ic_open_devs(void)
216 if (!(dev->flags & IFF_LOOPBACK)) 213 if (!(dev->flags & IFF_LOOPBACK))
217 continue; 214 continue;
218 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) 215 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
219 pr_err("IP-Config: Failed to open %s\n", dev->name); 216 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
220 } 217 }
221 218
222 for_each_netdev(&init_net, dev) { 219 for_each_netdev(&init_net, dev) {
@@ -225,8 +222,7 @@ static int __init ic_open_devs(void)
225 if (dev->mtu >= 364) 222 if (dev->mtu >= 364)
226 able |= IC_BOOTP; 223 able |= IC_BOOTP;
227 else 224 else
228 pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small", 225 printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
229 dev->name, dev->mtu);
230 if (!(dev->flags & IFF_NOARP)) 226 if (!(dev->flags & IFF_NOARP))
231 able |= IC_RARP; 227 able |= IC_RARP;
232 able &= ic_proto_enabled; 228 able &= ic_proto_enabled;
@@ -234,8 +230,7 @@ static int __init ic_open_devs(void)
234 continue; 230 continue;
235 oflags = dev->flags; 231 oflags = dev->flags;
236 if (dev_change_flags(dev, oflags | IFF_UP) < 0) { 232 if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
237 pr_err("IP-Config: Failed to open %s\n", 233 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
238 dev->name);
239 continue; 234 continue;
240 } 235 }
241 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { 236 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
@@ -277,10 +272,9 @@ have_carrier:
277 272
278 if (!ic_first_dev) { 273 if (!ic_first_dev) {
279 if (user_dev_name[0]) 274 if (user_dev_name[0])
280 pr_err("IP-Config: Device `%s' not found\n", 275 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
281 user_dev_name);
282 else 276 else
283 pr_err("IP-Config: No network devices available\n"); 277 printk(KERN_ERR "IP-Config: No network devices available.\n");
284 return -ENODEV; 278 return -ENODEV;
285 } 279 }
286 return 0; 280 return 0;
@@ -364,20 +358,17 @@ static int __init ic_setup_if(void)
364 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); 358 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
365 set_sockaddr(sin, ic_myaddr, 0); 359 set_sockaddr(sin, ic_myaddr, 0);
366 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { 360 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
367 pr_err("IP-Config: Unable to set interface address (%d)\n", 361 printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
368 err);
369 return -1; 362 return -1;
370 } 363 }
371 set_sockaddr(sin, ic_netmask, 0); 364 set_sockaddr(sin, ic_netmask, 0);
372 if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { 365 if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
373 pr_err("IP-Config: Unable to set interface netmask (%d)\n", 366 printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
374 err);
375 return -1; 367 return -1;
376 } 368 }
377 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); 369 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
378 if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { 370 if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
379 pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", 371 printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
380 err);
381 return -1; 372 return -1;
382 } 373 }
383 /* Handle the case where we need non-standard MTU on the boot link (a network 374 /* Handle the case where we need non-standard MTU on the boot link (a network
@@ -388,8 +379,8 @@ static int __init ic_setup_if(void)
388 strcpy(ir.ifr_name, ic_dev->name); 379 strcpy(ir.ifr_name, ic_dev->name);
389 ir.ifr_mtu = ic_dev_mtu; 380 ir.ifr_mtu = ic_dev_mtu;
390 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) 381 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
391 pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", 382 printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n",
392 ic_dev_mtu, err); 383 ic_dev_mtu, err);
393 } 384 }
394 return 0; 385 return 0;
395} 386}
@@ -404,7 +395,7 @@ static int __init ic_setup_routes(void)
404 395
405 memset(&rm, 0, sizeof(rm)); 396 memset(&rm, 0, sizeof(rm));
406 if ((ic_gateway ^ ic_myaddr) & ic_netmask) { 397 if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
407 pr_err("IP-Config: Gateway not on directly connected network\n"); 398 printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
408 return -1; 399 return -1;
409 } 400 }
410 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); 401 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
@@ -412,8 +403,7 @@ static int __init ic_setup_routes(void)
412 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); 403 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
413 rm.rt_flags = RTF_UP | RTF_GATEWAY; 404 rm.rt_flags = RTF_UP | RTF_GATEWAY;
414 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { 405 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
415 pr_err("IP-Config: Cannot add default route (%d)\n", 406 printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
416 err);
417 return -1; 407 return -1;
418 } 408 }
419 } 409 }
@@ -446,8 +436,8 @@ static int __init ic_defaults(void)
446 else if (IN_CLASSC(ntohl(ic_myaddr))) 436 else if (IN_CLASSC(ntohl(ic_myaddr)))
447 ic_netmask = htonl(IN_CLASSC_NET); 437 ic_netmask = htonl(IN_CLASSC_NET);
448 else { 438 else {
449 pr_err("IP-Config: Unable to guess netmask for address %pI4\n", 439 printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n",
450 &ic_myaddr); 440 &ic_myaddr);
451 return -1; 441 return -1;
452 } 442 }
453 printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); 443 printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
@@ -560,7 +550,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
560 if (ic_myaddr == NONE) 550 if (ic_myaddr == NONE)
561 ic_myaddr = tip; 551 ic_myaddr = tip;
562 ic_servaddr = sip; 552 ic_servaddr = sip;
563 ic_addrservaddr = sip;
564 ic_got_reply = IC_RARP; 553 ic_got_reply = IC_RARP;
565 554
566drop_unlock: 555drop_unlock:
@@ -586,17 +575,6 @@ static void __init ic_rarp_send_if(struct ic_device *d)
586#endif 575#endif
587 576
588/* 577/*
589 * Predefine Nameservers
590 */
591static inline void __init ic_nameservers_predef(void)
592{
593 int i;
594
595 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
596 ic_nameservers[i] = NONE;
597}
598
599/*
600 * DHCP/BOOTP support. 578 * DHCP/BOOTP support.
601 */ 579 */
602 580
@@ -709,8 +687,8 @@ ic_dhcp_init_options(u8 *options)
709 e += len; 687 e += len;
710 } 688 }
711 if (*vendor_class_identifier) { 689 if (*vendor_class_identifier) {
712 pr_info("DHCP: sending class identifier \"%s\"\n", 690 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
713 vendor_class_identifier); 691 vendor_class_identifier);
714 *e++ = 60; /* Class-identifier */ 692 *e++ = 60; /* Class-identifier */
715 len = strlen(vendor_class_identifier); 693 len = strlen(vendor_class_identifier);
716 *e++ = len; 694 *e++ = len;
@@ -761,7 +739,10 @@ static void __init ic_bootp_init_ext(u8 *e)
761 */ 739 */
762static inline void __init ic_bootp_init(void) 740static inline void __init ic_bootp_init(void)
763{ 741{
764 ic_nameservers_predef(); 742 int i;
743
744 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
745 ic_nameservers[i] = NONE;
765 746
766 dev_add_pack(&bootp_packet_type); 747 dev_add_pack(&bootp_packet_type);
767} 748}
@@ -785,15 +766,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
785 struct sk_buff *skb; 766 struct sk_buff *skb;
786 struct bootp_pkt *b; 767 struct bootp_pkt *b;
787 struct iphdr *h; 768 struct iphdr *h;
788 int hlen = LL_RESERVED_SPACE(dev);
789 int tlen = dev->needed_tailroom;
790 769
791 /* Allocate packet */ 770 /* Allocate packet */
792 skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15, 771 skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15,
793 GFP_KERNEL); 772 GFP_KERNEL);
794 if (!skb) 773 if (!skb)
795 return; 774 return;
796 skb_reserve(skb, hlen); 775 skb_reserve(skb, LL_RESERVED_SPACE(dev));
797 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); 776 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
798 memset(b, 0, sizeof(struct bootp_pkt)); 777 memset(b, 0, sizeof(struct bootp_pkt));
799 778
@@ -819,6 +798,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
819 b->op = BOOTP_REQUEST; 798 b->op = BOOTP_REQUEST;
820 if (dev->type < 256) /* check for false types */ 799 if (dev->type < 256) /* check for false types */
821 b->htype = dev->type; 800 b->htype = dev->type;
801 else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
802 b->htype = ARPHRD_IEEE802;
822 else if (dev->type == ARPHRD_FDDI) 803 else if (dev->type == ARPHRD_FDDI)
823 b->htype = ARPHRD_ETHER; 804 b->htype = ARPHRD_ETHER;
824 else { 805 else {
@@ -844,13 +825,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
844 skb->dev = dev; 825 skb->dev = dev;
845 skb->protocol = htons(ETH_P_IP); 826 skb->protocol = htons(ETH_P_IP);
846 if (dev_hard_header(skb, dev, ntohs(skb->protocol), 827 if (dev_hard_header(skb, dev, ntohs(skb->protocol),
847 dev->broadcast, dev->dev_addr, skb->len) < 0) { 828 dev->broadcast, dev->dev_addr, skb->len) < 0 ||
848 kfree_skb(skb); 829 dev_queue_xmit(skb) < 0)
849 printk("E");
850 return;
851 }
852
853 if (dev_queue_xmit(skb) < 0)
854 printk("E"); 830 printk("E");
855} 831}
856 832
@@ -875,9 +851,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)
875 */ 851 */
876static void __init ic_do_bootp_ext(u8 *ext) 852static void __init ic_do_bootp_ext(u8 *ext)
877{ 853{
878 u8 servers; 854 u8 servers;
879 int i; 855 int i;
880 __be16 mtu; 856 u16 mtu;
881 857
882#ifdef IPCONFIG_DEBUG 858#ifdef IPCONFIG_DEBUG
883 u8 *c; 859 u8 *c;
@@ -964,7 +940,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
964 940
965 /* Fragments are not supported */ 941 /* Fragments are not supported */
966 if (ip_is_fragment(h)) { 942 if (ip_is_fragment(h)) {
967 net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n"); 943 if (net_ratelimit())
944 printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
945 "reply.\n");
968 goto drop; 946 goto drop;
969 } 947 }
970 948
@@ -1012,14 +990,17 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1012 /* Is it a reply to our BOOTP request? */ 990 /* Is it a reply to our BOOTP request? */
1013 if (b->op != BOOTP_REPLY || 991 if (b->op != BOOTP_REPLY ||
1014 b->xid != d->xid) { 992 b->xid != d->xid) {
1015 net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", 993 if (net_ratelimit())
1016 b->op, b->xid); 994 printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
995 "op[%x] xid[%x]\n",
996 b->op, b->xid);
1017 goto drop_unlock; 997 goto drop_unlock;
1018 } 998 }
1019 999
1020 /* Is it a reply for the device we are configuring? */ 1000 /* Is it a reply for the device we are configuring? */
1021 if (b->xid != ic_dev_xid) { 1001 if (b->xid != ic_dev_xid) {
1022 net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n"); 1002 if (net_ratelimit())
1003 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
1023 goto drop_unlock; 1004 goto drop_unlock;
1024 } 1005 }
1025 1006
@@ -1071,7 +1052,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1071 ic_servaddr = server_id; 1052 ic_servaddr = server_id;
1072#ifdef IPCONFIG_DEBUG 1053#ifdef IPCONFIG_DEBUG
1073 printk("DHCP: Offered address %pI4 by server %pI4\n", 1054 printk("DHCP: Offered address %pI4 by server %pI4\n",
1074 &ic_myaddr, &b->iph.saddr); 1055 &ic_myaddr, &ic_servaddr);
1075#endif 1056#endif
1076 /* The DHCP indicated server address takes 1057 /* The DHCP indicated server address takes
1077 * precedence over the bootp header one if 1058 * precedence over the bootp header one if
@@ -1116,7 +1097,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1116 ic_dev = dev; 1097 ic_dev = dev;
1117 ic_myaddr = b->your_ip; 1098 ic_myaddr = b->your_ip;
1118 ic_servaddr = b->server_ip; 1099 ic_servaddr = b->server_ip;
1119 ic_addrservaddr = b->iph.saddr;
1120 if (ic_gateway == NONE && b->relay_ip) 1100 if (ic_gateway == NONE && b->relay_ip)
1121 ic_gateway = b->relay_ip; 1101 ic_gateway = b->relay_ip;
1122 if (ic_nameservers[0] == NONE) 1102 if (ic_nameservers[0] == NONE)
@@ -1158,17 +1138,17 @@ static int __init ic_dynamic(void)
1158 * are missing, and without DHCP/BOOTP/RARP we are unable to get it. 1138 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
1159 */ 1139 */
1160 if (!ic_proto_enabled) { 1140 if (!ic_proto_enabled) {
1161 pr_err("IP-Config: Incomplete network configuration information\n"); 1141 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1162 return -1; 1142 return -1;
1163 } 1143 }
1164 1144
1165#ifdef IPCONFIG_BOOTP 1145#ifdef IPCONFIG_BOOTP
1166 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) 1146 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
1167 pr_err("DHCP/BOOTP: No suitable device found\n"); 1147 printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
1168#endif 1148#endif
1169#ifdef IPCONFIG_RARP 1149#ifdef IPCONFIG_RARP
1170 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) 1150 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
1171 pr_err("RARP: No suitable device found\n"); 1151 printk(KERN_ERR "RARP: No suitable device found.\n");
1172#endif 1152#endif
1173 1153
1174 if (!ic_proto_have_if) 1154 if (!ic_proto_have_if)
@@ -1195,17 +1175,17 @@ static int __init ic_dynamic(void)
1195 * [Actually we could now, but the nothing else running note still 1175 * [Actually we could now, but the nothing else running note still
1196 * applies.. - AC] 1176 * applies.. - AC]
1197 */ 1177 */
1198 pr_notice("Sending %s%s%s requests .", 1178 printk(KERN_NOTICE "Sending %s%s%s requests .",
1199 do_bootp 1179 do_bootp
1200 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", 1180 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
1201 (do_bootp && do_rarp) ? " and " : "", 1181 (do_bootp && do_rarp) ? " and " : "",
1202 do_rarp ? "RARP" : ""); 1182 do_rarp ? "RARP" : "");
1203 1183
1204 start_jiffies = jiffies; 1184 start_jiffies = jiffies;
1205 d = ic_first_dev; 1185 d = ic_first_dev;
1206 retries = CONF_SEND_RETRIES; 1186 retries = CONF_SEND_RETRIES;
1207 get_random_bytes(&timeout, sizeof(timeout)); 1187 get_random_bytes(&timeout, sizeof(timeout));
1208 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); 1188 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
1209 for (;;) { 1189 for (;;) {
1210 /* Track the device we are configuring */ 1190 /* Track the device we are configuring */
1211 ic_dev_xid = d->xid; 1191 ic_dev_xid = d->xid;
@@ -1228,13 +1208,13 @@ static int __init ic_dynamic(void)
1228 (ic_proto_enabled & IC_USE_DHCP) && 1208 (ic_proto_enabled & IC_USE_DHCP) &&
1229 ic_dhcp_msgtype != DHCPACK) { 1209 ic_dhcp_msgtype != DHCPACK) {
1230 ic_got_reply = 0; 1210 ic_got_reply = 0;
1231 pr_cont(","); 1211 printk(KERN_CONT ",");
1232 continue; 1212 continue;
1233 } 1213 }
1234#endif /* IPCONFIG_DHCP */ 1214#endif /* IPCONFIG_DHCP */
1235 1215
1236 if (ic_got_reply) { 1216 if (ic_got_reply) {
1237 pr_cont(" OK\n"); 1217 printk(KERN_CONT " OK\n");
1238 break; 1218 break;
1239 } 1219 }
1240 1220
@@ -1242,7 +1222,7 @@ static int __init ic_dynamic(void)
1242 continue; 1222 continue;
1243 1223
1244 if (! --retries) { 1224 if (! --retries) {
1245 pr_cont(" timed out!\n"); 1225 printk(KERN_CONT " timed out!\n");
1246 break; 1226 break;
1247 } 1227 }
1248 1228
@@ -1252,7 +1232,7 @@ static int __init ic_dynamic(void)
1252 if (timeout > CONF_TIMEOUT_MAX) 1232 if (timeout > CONF_TIMEOUT_MAX)
1253 timeout = CONF_TIMEOUT_MAX; 1233 timeout = CONF_TIMEOUT_MAX;
1254 1234
1255 pr_cont("."); 1235 printk(KERN_CONT ".");
1256 } 1236 }
1257 1237
1258#ifdef IPCONFIG_BOOTP 1238#ifdef IPCONFIG_BOOTP
@@ -1272,8 +1252,8 @@ static int __init ic_dynamic(void)
1272 printk("IP-Config: Got %s answer from %pI4, ", 1252 printk("IP-Config: Got %s answer from %pI4, ",
1273 ((ic_got_reply & IC_RARP) ? "RARP" 1253 ((ic_got_reply & IC_RARP) ? "RARP"
1274 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1254 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1275 &ic_addrservaddr); 1255 &ic_servaddr);
1276 pr_cont("my address is %pI4\n", &ic_myaddr); 1256 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1277 1257
1278 return 0; 1258 return 0;
1279} 1259}
@@ -1391,7 +1371,6 @@ static int __init ip_auto_config(void)
1391 int retries = CONF_OPEN_RETRIES; 1371 int retries = CONF_OPEN_RETRIES;
1392#endif 1372#endif
1393 int err; 1373 int err;
1394 unsigned int i;
1395 1374
1396#ifdef CONFIG_PROC_FS 1375#ifdef CONFIG_PROC_FS
1397 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1376 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1450,22 +1429,24 @@ static int __init ip_auto_config(void)
1450 */ 1429 */
1451#ifdef CONFIG_ROOT_NFS 1430#ifdef CONFIG_ROOT_NFS
1452 if (ROOT_DEV == Root_NFS) { 1431 if (ROOT_DEV == Root_NFS) {
1453 pr_err("IP-Config: Retrying forever (NFS root)...\n"); 1432 printk(KERN_ERR
1433 "IP-Config: Retrying forever (NFS root)...\n");
1454 goto try_try_again; 1434 goto try_try_again;
1455 } 1435 }
1456#endif 1436#endif
1457 1437
1458 if (--retries) { 1438 if (--retries) {
1459 pr_err("IP-Config: Reopening network devices...\n"); 1439 printk(KERN_ERR
1440 "IP-Config: Reopening network devices...\n");
1460 goto try_try_again; 1441 goto try_try_again;
1461 } 1442 }
1462 1443
1463 /* Oh, well. At least we tried. */ 1444 /* Oh, well. At least we tried. */
1464 pr_err("IP-Config: Auto-configuration of network failed\n"); 1445 printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
1465 return -1; 1446 return -1;
1466 } 1447 }
1467#else /* !DYNAMIC */ 1448#else /* !DYNAMIC */
1468 pr_err("IP-Config: Incomplete network configuration information\n"); 1449 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1469 ic_close_devs(); 1450 ic_close_devs();
1470 return -1; 1451 return -1;
1471#endif /* IPCONFIG_DYNAMIC */ 1452#endif /* IPCONFIG_DYNAMIC */
@@ -1503,26 +1484,19 @@ static int __init ip_auto_config(void)
1503 /* 1484 /*
1504 * Clue in the operator. 1485 * Clue in the operator.
1505 */ 1486 */
1506 pr_info("IP-Config: Complete:\n"); 1487 printk("IP-Config: Complete:\n");
1507 1488 printk(" device=%s", ic_dev->name);
1508 pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", 1489 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1509 ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, 1490 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1510 &ic_myaddr, &ic_netmask, &ic_gateway); 1491 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1511 pr_info(" host=%s, domain=%s, nis-domain=%s\n", 1492 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1512 utsname()->nodename, ic_domain, utsname()->domainname); 1493 utsname()->nodename, ic_domain, utsname()->domainname);
1513 pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s", 1494 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1514 &ic_servaddr, &root_server_addr, root_server_path); 1495 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1496 printk(KERN_CONT ", rootpath=%s", root_server_path);
1515 if (ic_dev_mtu) 1497 if (ic_dev_mtu)
1516 pr_cont(", mtu=%d", ic_dev_mtu); 1498 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1517 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) 1499 printk(KERN_CONT "\n");
1518 if (ic_nameservers[i] != NONE) {
1519 pr_info(" nameserver%u=%pI4",
1520 i, &ic_nameservers[i]);
1521 break;
1522 }
1523 for (i++; i < CONF_NAMESERVERS_MAX; i++)
1524 if (ic_nameservers[i] != NONE)
1525 pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
1526#endif /* !SILENT */ 1500#endif /* !SILENT */
1527 1501
1528 return 0; 1502 return 0;
@@ -1593,8 +1567,6 @@ static int __init ip_auto_config_setup(char *addrs)
1593 return 1; 1567 return 1;
1594 } 1568 }
1595 1569
1596 ic_nameservers_predef();
1597
1598 /* Parse string for static IP assignment. */ 1570 /* Parse string for static IP assignment. */
1599 ip = addrs; 1571 ip = addrs;
1600 while (ip && *ip) { 1572 while (ip && *ip) {
@@ -1638,20 +1610,6 @@ static int __init ip_auto_config_setup(char *addrs)
1638 ic_enable = 0; 1610 ic_enable = 0;
1639 } 1611 }
1640 break; 1612 break;
1641 case 7:
1642 if (CONF_NAMESERVERS_MAX >= 1) {
1643 ic_nameservers[0] = in_aton(ip);
1644 if (ic_nameservers[0] == ANY)
1645 ic_nameservers[0] = NONE;
1646 }
1647 break;
1648 case 8:
1649 if (CONF_NAMESERVERS_MAX >= 2) {
1650 ic_nameservers[1] = in_aton(ip);
1651 if (ic_nameservers[1] == ANY)
1652 ic_nameservers[1] = NONE;
1653 }
1654 break;
1655 } 1613 }
1656 } 1614 }
1657 ip = cp; 1615 ip = cp;
@@ -1660,21 +1618,22 @@ static int __init ip_auto_config_setup(char *addrs)
1660 1618
1661 return 1; 1619 return 1;
1662} 1620}
1663__setup("ip=", ip_auto_config_setup);
1664 1621
1665static int __init nfsaddrs_config_setup(char *addrs) 1622static int __init nfsaddrs_config_setup(char *addrs)
1666{ 1623{
1667 return ip_auto_config_setup(addrs); 1624 return ip_auto_config_setup(addrs);
1668} 1625}
1669__setup("nfsaddrs=", nfsaddrs_config_setup);
1670 1626
1671static int __init vendor_class_identifier_setup(char *addrs) 1627static int __init vendor_class_identifier_setup(char *addrs)
1672{ 1628{
1673 if (strlcpy(vendor_class_identifier, addrs, 1629 if (strlcpy(vendor_class_identifier, addrs,
1674 sizeof(vendor_class_identifier)) 1630 sizeof(vendor_class_identifier))
1675 >= sizeof(vendor_class_identifier)) 1631 >= sizeof(vendor_class_identifier))
1676 pr_warn("DHCP: vendorclass too long, truncated to \"%s\"", 1632 printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"",
1677 vendor_class_identifier); 1633 vendor_class_identifier);
1678 return 1; 1634 return 1;
1679} 1635}
1636
1637__setup("ip=", ip_auto_config_setup);
1638__setup("nfsaddrs=", nfsaddrs_config_setup);
1680__setup("dhcpclass=", vendor_class_identifier_setup); 1639__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 191fc24a745..6f06f7f39ea 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,10 +120,6 @@
120#define HASH_SIZE 16 120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
128struct ipip_net { 124struct ipip_net {
129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -138,43 +134,43 @@ struct ipip_net {
138static int ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
139static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
140static void ipip_dev_free(struct net_device *dev); 136static void ipip_dev_free(struct net_device *dev);
141static struct rtnl_link_ops ipip_link_ops __read_mostly;
142 137
143static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, 138/*
144 struct rtnl_link_stats64 *tot) 139 * Locking : hash tables are protected by RCU and RTNL
140 */
141
142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
145{ 154{
155 struct pcpu_tstats sum = { 0 };
146 int i; 156 int i;
147 157
148 for_each_possible_cpu(i) { 158 for_each_possible_cpu(i) {
149 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
150 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
151 unsigned int start;
152
153 do {
154 start = u64_stats_fetch_begin_bh(&tstats->syncp);
155 rx_packets = tstats->rx_packets;
156 tx_packets = tstats->tx_packets;
157 rx_bytes = tstats->rx_bytes;
158 tx_bytes = tstats->tx_bytes;
159 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
160
161 tot->rx_packets += rx_packets;
162 tot->tx_packets += tx_packets;
163 tot->rx_bytes += rx_bytes;
164 tot->tx_bytes += tx_bytes;
165 }
166
167 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
168 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
169 tot->tx_dropped = dev->stats.tx_dropped;
170 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
171 tot->tx_errors = dev->stats.tx_errors;
172 tot->collisions = dev->stats.collisions;
173 160
174 return tot; 161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
175} 171}
176 172
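The right-hand ipip_get_stats() above replaces the u64_stats_fetch/ndo_get_stats64 path with a plain fold of the per-cpu pcpu_tstats counters into dev->stats. Below is a stand-alone sketch of that "hot counters per CPU, sum on read" idea, with a fixed array standing in for per_cpu_ptr(); the names and values are illustrative only.

/* Sketch of the per-CPU counter fold used by ipip_get_stats(). */
#include <stdio.h>

#define NR_CPUS 4

struct tstats {
	unsigned long rx_packets, rx_bytes, tx_packets, tx_bytes;
};

static struct tstats percpu[NR_CPUS];	/* each slot written only by "its" CPU */

static struct tstats fold_stats(void)
{
	struct tstats sum = { 0 };
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		sum.rx_packets += percpu[i].rx_packets;
		sum.rx_bytes   += percpu[i].rx_bytes;
		sum.tx_packets += percpu[i].tx_packets;
		sum.tx_bytes   += percpu[i].tx_bytes;
	}
	return sum;		/* what dev->stats gets refreshed from */
}

int main(void)
{
	percpu[0].rx_packets = 3; percpu[0].rx_bytes = 1800;
	percpu[2].rx_packets = 1; percpu[2].rx_bytes = 60;

	struct tstats t = fold_stats();
	printf("rx_packets=%lu rx_bytes=%lu\n", t.rx_packets, t.rx_bytes);
	return 0;
}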
177static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
178 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
179{ 175{
180 unsigned int h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
@@ -182,16 +178,16 @@ static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
182 struct ip_tunnel *t; 178 struct ip_tunnel *t;
183 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
184 180
185 for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) 181 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
186 if (local == t->parms.iph.saddr && 182 if (local == t->parms.iph.saddr &&
187 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 183 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
188 return t; 184 return t;
189 185
190 for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) 186 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
191 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 187 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
192 return t; 188 return t;
193 189
194 for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) 190 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
195 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) 191 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
196 return t; 192 return t;
197 193
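ipip_tunnel_lookup() above walks the hash buckets in priority order: an exact (remote, local) match first, then remote-only, then local-only (the fully wildcard fallback bucket lies outside this hunk). A minimal sketch of that precedence with a flat table and no RCU follows; the addresses and tunnel names are made up for the illustration.

/* Sketch of the lookup precedence in ipip_tunnel_lookup(). */
#include <stdint.h>
#include <stdio.h>

struct tun {
	uint32_t saddr;		/* local  (0 = wildcard) */
	uint32_t daddr;		/* remote (0 = wildcard) */
	const char *name;
};

static struct tun tuns[] = {
	{ 0,          0xc0a80002, "tunl-remote-only" },
	{ 0x0a000001, 0xc0a80002, "tunl-exact" },
	{ 0x0a000001, 0,          "tunl-local-only" },
};

static const char *lookup(uint32_t remote, uint32_t local)
{
	unsigned int i, n = sizeof(tuns) / sizeof(tuns[0]);

	for (i = 0; i < n; i++)		/* pass 1: exact (remote, local) */
		if (tuns[i].saddr == local && tuns[i].daddr == remote)
			return tuns[i].name;
	for (i = 0; i < n; i++)		/* pass 2: remote only */
		if (tuns[i].daddr == remote && tuns[i].saddr == 0)
			return tuns[i].name;
	for (i = 0; i < n; i++)		/* pass 3: local only */
		if (tuns[i].saddr == local && tuns[i].daddr == 0)
			return tuns[i].name;
	return "(none)";
}

int main(void)
{
	printf("%s\n", lookup(0xc0a80002, 0x0a000001));	/* tunl-exact */
	printf("%s\n", lookup(0xc0a80002, 0x0a000009));	/* tunl-remote-only */
	return 0;
}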
@@ -249,33 +245,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
249 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
250} 246}
251 247
252static int ipip_tunnel_create(struct net_device *dev) 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
253{
254 struct ip_tunnel *t = netdev_priv(dev);
255 struct net *net = dev_net(dev);
256 struct ipip_net *ipn = net_generic(net, ipip_net_id);
257 int err;
258
259 err = ipip_tunnel_init(dev);
260 if (err < 0)
261 goto out;
262
263 err = register_netdevice(dev);
264 if (err < 0)
265 goto out;
266
267 strcpy(t->parms.name, dev->name);
268 dev->rtnl_link_ops = &ipip_link_ops;
269
270 dev_hold(dev);
271 ipip_tunnel_link(ipn, t);
272 return 0;
273
274out:
275 return err;
276}
277
278static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
279 struct ip_tunnel_parm *parms, int create) 249 struct ip_tunnel_parm *parms, int create)
280{ 250{
281 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
@@ -309,9 +279,16 @@ static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
309 nt = netdev_priv(dev); 279 nt = netdev_priv(dev);
310 nt->parms = *parms; 280 nt->parms = *parms;
311 281
312 if (ipip_tunnel_create(dev) < 0) 282 if (ipip_tunnel_init(dev) < 0)
283 goto failed_free;
284
285 if (register_netdevice(dev) < 0)
313 goto failed_free; 286 goto failed_free;
314 287
288 strcpy(nt->parms.name, dev->name);
289
290 dev_hold(dev);
291 ipip_tunnel_link(ipn, nt);
315 return nt; 292 return nt;
316 293
317failed_free: 294failed_free:
@@ -326,7 +303,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
326 struct ipip_net *ipn = net_generic(net, ipip_net_id); 303 struct ipip_net *ipn = net_generic(net, ipip_net_id);
327 304
328 if (dev == ipn->fb_tunnel_dev) 305 if (dev == ipn->fb_tunnel_dev)
329 RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); 306 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
330 else 307 else
331 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 308 ipip_tunnel_unlink(ipn, netdev_priv(dev));
332 dev_put(dev); 309 dev_put(dev);
@@ -356,6 +333,9 @@ static int ipip_err(struct sk_buff *skb, u32 info)
356 case ICMP_PORT_UNREACH: 333 case ICMP_PORT_UNREACH:
357 /* Impossible event. */ 334 /* Impossible event. */
358 return 0; 335 return 0;
336 case ICMP_FRAG_NEEDED:
337 /* Soft state for pmtu is maintained by IP core. */
338 return 0;
359 default: 339 default:
360 /* All others are translated to HOST_UNREACH. 340 /* All others are translated to HOST_UNREACH.
361 rfc2003 contains "deep thoughts" about NET_UNREACH, 341 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -368,30 +348,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
368 if (code != ICMP_EXC_TTL) 348 if (code != ICMP_EXC_TTL)
369 return 0; 349 return 0;
370 break; 350 break;
371 case ICMP_REDIRECT:
372 break;
373 } 351 }
374 352
375 err = -ENOENT; 353 err = -ENOENT;
376 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
377 if (t == NULL)
378 goto out;
379 354
380 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 355 rcu_read_lock();
381 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 356 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
382 t->dev->ifindex, 0, IPPROTO_IPIP, 0); 357 if (t == NULL || t->parms.iph.daddr == 0)
383 err = 0;
384 goto out;
385 }
386
387 if (type == ICMP_REDIRECT) {
388 ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
389 IPPROTO_IPIP, 0);
390 err = 0;
391 goto out;
392 }
393
394 if (t->parms.iph.daddr == 0)
395 goto out; 358 goto out;
396 359
397 err = 0; 360 err = 0;
@@ -404,22 +367,34 @@ static int ipip_err(struct sk_buff *skb, u32 info)
404 t->err_count = 1; 367 t->err_count = 1;
405 t->err_time = jiffies; 368 t->err_time = jiffies;
406out: 369out:
407 370 rcu_read_unlock();
408 return err; 371 return err;
409} 372}
410 373
374static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
375 struct sk_buff *skb)
376{
377 struct iphdr *inner_iph = ip_hdr(skb);
378
379 if (INET_ECN_is_ce(outer_iph->tos))
380 IP_ECN_set_ce(inner_iph);
381}
382
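The restored ipip_ecn_decapsulate() above, like the IP_ECN_decapsulate() call it replaces, propagates a congestion mark from the outer header to the inner one when the tunnel is decapsulated. Below is a simplified sketch of that rule on plain TOS bytes; the kernel helpers also repair the IP checksum, which is omitted, and the drop of non-ECT inner packets is the stricter behaviour of the newer code, kept here for completeness.

/* Sketch of ECN handling on decapsulation (RFC 3168/6040 style). */
#include <stdio.h>

#define ECN_MASK    0x03	/* low two bits of the TOS byte */
#define ECN_NOT_ECT 0x00
#define ECN_CE      0x03

/* Returns 0 on success, -1 if the frame should be dropped
 * (outer says "congestion", inner cannot carry the mark). */
static int ecn_decapsulate(unsigned char outer_tos, unsigned char *inner_tos)
{
	if ((outer_tos & ECN_MASK) != ECN_CE)
		return 0;			/* nothing to propagate */
	if ((*inner_tos & ECN_MASK) == ECN_NOT_ECT)
		return -1;			/* stricter variant: drop */
	*inner_tos |= ECN_CE;			/* mark the inner packet CE */
	return 0;
}

int main(void)
{
	unsigned char inner = 0x02;		/* ECT(0) */

	if (ecn_decapsulate(0x03, &inner) == 0)
		printf("inner tos now 0x%02x (CE set)\n", inner);
	return 0;
}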
411static int ipip_rcv(struct sk_buff *skb) 383static int ipip_rcv(struct sk_buff *skb)
412{ 384{
413 struct ip_tunnel *tunnel; 385 struct ip_tunnel *tunnel;
414 const struct iphdr *iph = ip_hdr(skb); 386 const struct iphdr *iph = ip_hdr(skb);
415 int err;
416 387
388 rcu_read_lock();
417 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 389 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
418 if (tunnel != NULL) { 390 if (tunnel != NULL) {
419 struct pcpu_tstats *tstats; 391 struct pcpu_tstats *tstats;
420 392
421 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 393 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
422 goto drop; 394 rcu_read_unlock();
395 kfree_skb(skb);
396 return 0;
397 }
423 398
424 secpath_reset(skb); 399 secpath_reset(skb);
425 400
@@ -428,35 +403,22 @@ static int ipip_rcv(struct sk_buff *skb)
428 skb->protocol = htons(ETH_P_IP); 403 skb->protocol = htons(ETH_P_IP);
429 skb->pkt_type = PACKET_HOST; 404 skb->pkt_type = PACKET_HOST;
430 405
431 __skb_tunnel_rx(skb, tunnel->dev);
432
433 err = IP_ECN_decapsulate(iph, skb);
434 if (unlikely(err)) {
435 if (log_ecn_error)
436 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
437 &iph->saddr, iph->tos);
438 if (err > 1) {
439 ++tunnel->dev->stats.rx_frame_errors;
440 ++tunnel->dev->stats.rx_errors;
441 goto drop;
442 }
443 }
444
445 tstats = this_cpu_ptr(tunnel->dev->tstats); 406 tstats = this_cpu_ptr(tunnel->dev->tstats);
446 u64_stats_update_begin(&tstats->syncp);
447 tstats->rx_packets++; 407 tstats->rx_packets++;
448 tstats->rx_bytes += skb->len; 408 tstats->rx_bytes += skb->len;
449 u64_stats_update_end(&tstats->syncp); 409
410 __skb_tunnel_rx(skb, tunnel->dev);
411
412 ipip_ecn_decapsulate(iph, skb);
450 413
451 netif_rx(skb); 414 netif_rx(skb);
415
416 rcu_read_unlock();
452 return 0; 417 return 0;
453 } 418 }
419 rcu_read_unlock();
454 420
455 return -1; 421 return -1;
456
457drop:
458 kfree_skb(skb);
459 return 0;
460} 422}
461 423
462/* 424/*
@@ -467,6 +429,7 @@ drop:
467static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 429static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
468{ 430{
469 struct ip_tunnel *tunnel = netdev_priv(dev); 431 struct ip_tunnel *tunnel = netdev_priv(dev);
432 struct pcpu_tstats *tstats;
470 const struct iphdr *tiph = &tunnel->parms.iph; 433 const struct iphdr *tiph = &tunnel->parms.iph;
471 u8 tos = tunnel->parms.iph.tos; 434 u8 tos = tunnel->parms.iph.tos;
472 __be16 df = tiph->frag_off; 435 __be16 df = tiph->frag_off;
@@ -482,10 +445,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
482 if (skb->protocol != htons(ETH_P_IP)) 445 if (skb->protocol != htons(ETH_P_IP))
483 goto tx_error; 446 goto tx_error;
484 447
485 if (skb->ip_summed == CHECKSUM_PARTIAL &&
486 skb_checksum_help(skb))
487 goto tx_error;
488
489 if (tos & 1) 448 if (tos & 1)
490 tos = old_iph->tos; 449 tos = old_iph->tos;
491 450
@@ -495,7 +454,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
495 dev->stats.tx_fifo_errors++; 454 dev->stats.tx_fifo_errors++;
496 goto tx_error; 455 goto tx_error;
497 } 456 }
498 dst = rt_nexthop(rt, old_iph->daddr); 457 if ((dst = rt->rt_gateway) == 0)
458 goto tx_error_icmp;
499 } 459 }
500 460
501 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 461 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
@@ -527,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
527 } 487 }
528 488
529 if (skb_dst(skb)) 489 if (skb_dst(skb))
530 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 490 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
531 491
532 if ((old_iph->frag_off & htons(IP_DF)) && 492 if ((old_iph->frag_off & htons(IP_DF)) &&
533 mtu < ntohs(old_iph->tot_len)) { 493 mtu < ntohs(old_iph->tot_len)) {
@@ -593,7 +553,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
593 if ((iph->ttl = tiph->ttl) == 0) 553 if ((iph->ttl = tiph->ttl) == 0)
594 iph->ttl = old_iph->ttl; 554 iph->ttl = old_iph->ttl;
595 555
596 iptunnel_xmit(skb, dev); 556 nf_reset(skb);
557 tstats = this_cpu_ptr(dev->tstats);
558 __IPTUNNEL_XMIT(tstats, &dev->stats);
597 return NETDEV_TX_OK; 559 return NETDEV_TX_OK;
598 560
599tx_error_icmp: 561tx_error_icmp:
@@ -640,28 +602,6 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
640 dev->iflink = tunnel->parms.link; 602 dev->iflink = tunnel->parms.link;
641} 603}
642 604
643static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
644{
645 struct net *net = dev_net(t->dev);
646 struct ipip_net *ipn = net_generic(net, ipip_net_id);
647
648 ipip_tunnel_unlink(ipn, t);
649 synchronize_net();
650 t->parms.iph.saddr = p->iph.saddr;
651 t->parms.iph.daddr = p->iph.daddr;
652 memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
653 memcpy(t->dev->broadcast, &p->iph.daddr, 4);
654 ipip_tunnel_link(ipn, t);
655 t->parms.iph.ttl = p->iph.ttl;
656 t->parms.iph.tos = p->iph.tos;
657 t->parms.iph.frag_off = p->iph.frag_off;
658 if (t->parms.link != p->link) {
659 t->parms.link = p->link;
660 ipip_tunnel_bind_dev(t->dev);
661 }
662 netdev_state_change(t->dev);
663}
664
665static int 605static int
666ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) 606ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
667{ 607{
@@ -691,7 +631,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
691 case SIOCADDTUNNEL: 631 case SIOCADDTUNNEL:
692 case SIOCCHGTUNNEL: 632 case SIOCCHGTUNNEL:
693 err = -EPERM; 633 err = -EPERM;
694 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 634 if (!capable(CAP_NET_ADMIN))
695 goto done; 635 goto done;
696 636
697 err = -EFAULT; 637 err = -EFAULT;
@@ -720,13 +660,29 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
720 break; 660 break;
721 } 661 }
722 t = netdev_priv(dev); 662 t = netdev_priv(dev);
663 ipip_tunnel_unlink(ipn, t);
664 synchronize_net();
665 t->parms.iph.saddr = p.iph.saddr;
666 t->parms.iph.daddr = p.iph.daddr;
667 memcpy(dev->dev_addr, &p.iph.saddr, 4);
668 memcpy(dev->broadcast, &p.iph.daddr, 4);
669 ipip_tunnel_link(ipn, t);
670 netdev_state_change(dev);
723 } 671 }
724
725 ipip_tunnel_update(t, &p);
726 } 672 }
727 673
728 if (t) { 674 if (t) {
729 err = 0; 675 err = 0;
676 if (cmd == SIOCCHGTUNNEL) {
677 t->parms.iph.ttl = p.iph.ttl;
678 t->parms.iph.tos = p.iph.tos;
679 t->parms.iph.frag_off = p.iph.frag_off;
680 if (t->parms.link != p.link) {
681 t->parms.link = p.link;
682 ipip_tunnel_bind_dev(dev);
683 netdev_state_change(dev);
684 }
685 }
730 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) 686 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
731 err = -EFAULT; 687 err = -EFAULT;
732 } else 688 } else
@@ -735,7 +691,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
735 691
736 case SIOCDELTUNNEL: 692 case SIOCDELTUNNEL:
737 err = -EPERM; 693 err = -EPERM;
738 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 694 if (!capable(CAP_NET_ADMIN))
739 goto done; 695 goto done;
740 696
741 if (dev == ipn->fb_tunnel_dev) { 697 if (dev == ipn->fb_tunnel_dev) {
@@ -775,7 +731,7 @@ static const struct net_device_ops ipip_netdev_ops = {
775 .ndo_start_xmit = ipip_tunnel_xmit, 731 .ndo_start_xmit = ipip_tunnel_xmit,
776 .ndo_do_ioctl = ipip_tunnel_ioctl, 732 .ndo_do_ioctl = ipip_tunnel_ioctl,
777 .ndo_change_mtu = ipip_tunnel_change_mtu, 733 .ndo_change_mtu = ipip_tunnel_change_mtu,
778 .ndo_get_stats64 = ipip_get_stats64, 734 .ndo_get_stats = ipip_get_stats,
779}; 735};
780 736
781static void ipip_dev_free(struct net_device *dev) 737static void ipip_dev_free(struct net_device *dev)
@@ -784,11 +740,6 @@ static void ipip_dev_free(struct net_device *dev)
784 free_netdev(dev); 740 free_netdev(dev);
785} 741}
786 742
787#define IPIP_FEATURES (NETIF_F_SG | \
788 NETIF_F_FRAGLIST | \
789 NETIF_F_HIGHDMA | \
790 NETIF_F_HW_CSUM)
791
792static void ipip_tunnel_setup(struct net_device *dev) 743static void ipip_tunnel_setup(struct net_device *dev)
793{ 744{
794 dev->netdev_ops = &ipip_netdev_ops; 745 dev->netdev_ops = &ipip_netdev_ops;
@@ -803,9 +754,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
803 dev->features |= NETIF_F_NETNS_LOCAL; 754 dev->features |= NETIF_F_NETNS_LOCAL;
804 dev->features |= NETIF_F_LLTX; 755 dev->features |= NETIF_F_LLTX;
805 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 756 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
806
807 dev->features |= IPIP_FEATURES;
808 dev->hw_features |= IPIP_FEATURES;
809} 757}
810 758
811static int ipip_tunnel_init(struct net_device *dev) 759static int ipip_tunnel_init(struct net_device *dev)
@@ -848,142 +796,6 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
848 return 0; 796 return 0;
849} 797}
850 798
851static void ipip_netlink_parms(struct nlattr *data[],
852 struct ip_tunnel_parm *parms)
853{
854 memset(parms, 0, sizeof(*parms));
855
856 parms->iph.version = 4;
857 parms->iph.protocol = IPPROTO_IPIP;
858 parms->iph.ihl = 5;
859
860 if (!data)
861 return;
862
863 if (data[IFLA_IPTUN_LINK])
864 parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
865
866 if (data[IFLA_IPTUN_LOCAL])
867 parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
868
869 if (data[IFLA_IPTUN_REMOTE])
870 parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
871
872 if (data[IFLA_IPTUN_TTL]) {
873 parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
874 if (parms->iph.ttl)
875 parms->iph.frag_off = htons(IP_DF);
876 }
877
878 if (data[IFLA_IPTUN_TOS])
879 parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
880
881 if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
882 parms->iph.frag_off = htons(IP_DF);
883}
884
885static int ipip_newlink(struct net *src_net, struct net_device *dev,
886 struct nlattr *tb[], struct nlattr *data[])
887{
888 struct net *net = dev_net(dev);
889 struct ip_tunnel *nt;
890
891 nt = netdev_priv(dev);
892 ipip_netlink_parms(data, &nt->parms);
893
894 if (ipip_tunnel_locate(net, &nt->parms, 0))
895 return -EEXIST;
896
897 return ipip_tunnel_create(dev);
898}
899
900static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
901 struct nlattr *data[])
902{
903 struct ip_tunnel *t;
904 struct ip_tunnel_parm p;
905 struct net *net = dev_net(dev);
906 struct ipip_net *ipn = net_generic(net, ipip_net_id);
907
908 if (dev == ipn->fb_tunnel_dev)
909 return -EINVAL;
910
911 ipip_netlink_parms(data, &p);
912
913 if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
914 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
915 return -EINVAL;
916
917 t = ipip_tunnel_locate(net, &p, 0);
918
919 if (t) {
920 if (t->dev != dev)
921 return -EEXIST;
922 } else
923 t = netdev_priv(dev);
924
925 ipip_tunnel_update(t, &p);
926 return 0;
927}
928
929static size_t ipip_get_size(const struct net_device *dev)
930{
931 return
932 /* IFLA_IPTUN_LINK */
933 nla_total_size(4) +
934 /* IFLA_IPTUN_LOCAL */
935 nla_total_size(4) +
936 /* IFLA_IPTUN_REMOTE */
937 nla_total_size(4) +
938 /* IFLA_IPTUN_TTL */
939 nla_total_size(1) +
940 /* IFLA_IPTUN_TOS */
941 nla_total_size(1) +
942 /* IFLA_IPTUN_PMTUDISC */
943 nla_total_size(1) +
944 0;
945}
946
947static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
948{
949 struct ip_tunnel *tunnel = netdev_priv(dev);
950 struct ip_tunnel_parm *parm = &tunnel->parms;
951
952 if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
953 nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
954 nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
955 nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
956 nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
957 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
958 !!(parm->iph.frag_off & htons(IP_DF))))
959 goto nla_put_failure;
960 return 0;
961
962nla_put_failure:
963 return -EMSGSIZE;
964}
965
966static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
967 [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
968 [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
969 [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
970 [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
971 [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
972 [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
973};
974
975static struct rtnl_link_ops ipip_link_ops __read_mostly = {
976 .kind = "ipip",
977 .maxtype = IFLA_IPTUN_MAX,
978 .policy = ipip_policy,
979 .priv_size = sizeof(struct ip_tunnel),
980 .setup = ipip_tunnel_setup,
981 .newlink = ipip_newlink,
982 .changelink = ipip_changelink,
983 .get_size = ipip_get_size,
984 .fill_info = ipip_fill_info,
985};
986
987static struct xfrm_tunnel ipip_handler __read_mostly = { 799static struct xfrm_tunnel ipip_handler __read_mostly = {
988 .handler = ipip_rcv, 800 .handler = ipip_rcv,
989 .err_handler = ipip_err, 801 .err_handler = ipip_err,
@@ -1080,28 +892,16 @@ static int __init ipip_init(void)
1080 return err; 892 return err;
1081 err = xfrm4_tunnel_register(&ipip_handler, AF_INET); 893 err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
1082 if (err < 0) { 894 if (err < 0) {
1083 pr_info("%s: can't register tunnel\n", __func__); 895 unregister_pernet_device(&ipip_net_ops);
1084 goto xfrm_tunnel_failed; 896 printk(KERN_INFO "ipip init: can't register tunnel\n");
1085 } 897 }
1086 err = rtnl_link_register(&ipip_link_ops);
1087 if (err < 0)
1088 goto rtnl_link_failed;
1089
1090out:
1091 return err; 898 return err;
1092
1093rtnl_link_failed:
1094 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
1095xfrm_tunnel_failed:
1096 unregister_pernet_device(&ipip_net_ops);
1097 goto out;
1098} 899}
1099 900
1100static void __exit ipip_fini(void) 901static void __exit ipip_fini(void)
1101{ 902{
1102 rtnl_link_unregister(&ipip_link_ops);
1103 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) 903 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
1104 pr_info("%s: can't deregister tunnel\n", __func__); 904 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
1105 905
1106 unregister_pernet_device(&ipip_net_ops); 906 unregister_pernet_device(&ipip_net_ops);
1107} 907}
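Beyond the printk/pr_info churn, the revert collapses ipip_init()'s three-step registration (pernet ops, xfrm tunnel handler, rtnl_link_ops) back to two steps, and with it the goto-based unwind labels. The pattern itself is worth keeping in mind; below is a stand-alone sketch with stub register/unregister functions (not the kernel APIs) showing how a late failure unwinds the earlier registrations in reverse order.

/* Sketch of goto-based error unwinding in a module init path. */
#include <stdio.h>

static int  register_pernet(void)        { puts("pernet registered");        return 0; }
static void unregister_pernet(void)      { puts("pernet unregistered");      }
static int  register_xfrm_tunnel(void)   { puts("xfrm handler registered");  return 0; }
static void unregister_xfrm_tunnel(void) { puts("xfrm handler unregistered"); }
static int  register_link_ops(void)      { puts("link ops: failing");        return -1; }

static int tunnel_module_init(void)
{
	int err;

	err = register_pernet();
	if (err < 0)
		goto out;
	err = register_xfrm_tunnel();
	if (err < 0)
		goto xfrm_failed;
	err = register_link_ops();
	if (err < 0)
		goto rtnl_link_failed;
out:
	return err;

rtnl_link_failed:
	unregister_xfrm_tunnel();
xfrm_failed:
	unregister_pernet();
	goto out;
}

int main(void)
{
	printf("init returned %d\n", tunnel_module_init());
	return 0;
}

The reverted ipip_init() keeps only the first unwind step, calling unregister_pernet_device() inline when xfrm registration fails, which is why the labels disappear from the diff.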
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a9454cbd953..58e87915797 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,6 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29#include <asm/system.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/types.h> 31#include <linux/types.h>
31#include <linux/capability.h> 32#include <linux/capability.h>
@@ -60,12 +61,10 @@
60#include <linux/if_arp.h> 61#include <linux/if_arp.h>
61#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
62#include <linux/compat.h> 63#include <linux/compat.h>
63#include <linux/export.h>
64#include <net/ipip.h> 64#include <net/ipip.h>
65#include <net/checksum.h> 65#include <net/checksum.h>
66#include <net/netlink.h> 66#include <net/netlink.h>
67#include <net/fib_rules.h> 67#include <net/fib_rules.h>
68#include <linux/netconf.h>
69 68
70#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 69#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
71#define CONFIG_IP_PIMSM 1 70#define CONFIG_IP_PIMSM 1
@@ -84,8 +83,8 @@ struct mr_table {
84 struct vif_device vif_table[MAXVIFS]; 83 struct vif_device vif_table[MAXVIFS];
85 int maxvif; 84 int maxvif;
86 atomic_t cache_resolve_queue_len; 85 atomic_t cache_resolve_queue_len;
87 bool mroute_do_assert; 86 int mroute_do_assert;
88 bool mroute_do_pim; 87 int mroute_do_pim;
89#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 88#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
90 int mroute_reg_vif_num; 89 int mroute_reg_vif_num;
91#endif 90#endif
@@ -125,8 +124,6 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
125static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
126 125
127static struct mr_table *ipmr_new_table(struct net *net, u32 id); 126static struct mr_table *ipmr_new_table(struct net *net, u32 id);
128static void ipmr_free_table(struct mr_table *mrt);
129
130static int ip_mr_forward(struct net *net, struct mr_table *mrt, 127static int ip_mr_forward(struct net *net, struct mr_table *mrt,
131 struct sk_buff *skb, struct mfc_cache *cache, 128 struct sk_buff *skb, struct mfc_cache *cache,
132 int local); 129 int local);
@@ -134,9 +131,6 @@ static int ipmr_cache_report(struct mr_table *mrt,
134 struct sk_buff *pkt, vifi_t vifi, int assert); 131 struct sk_buff *pkt, vifi_t vifi, int assert);
135static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 132static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
136 struct mfc_cache *c, struct rtmsg *rtm); 133 struct mfc_cache *c, struct rtmsg *rtm);
137static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
138 int cmd);
139static void mroute_clean_tables(struct mr_table *mrt);
140static void ipmr_expire_process(unsigned long arg); 134static void ipmr_expire_process(unsigned long arg);
141 135
142#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES 136#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -224,7 +218,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 return 0; 218 return 0;
225} 219}
226 220
227static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { 221static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
228 .family = RTNL_FAMILY_IPMR, 222 .family = RTNL_FAMILY_IPMR,
229 .rule_size = sizeof(struct ipmr_rule), 223 .rule_size = sizeof(struct ipmr_rule),
230 .addr_size = sizeof(u32), 224 .addr_size = sizeof(u32),
@@ -277,7 +271,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
277 271
278 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 272 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
279 list_del(&mrt->list); 273 list_del(&mrt->list);
280 ipmr_free_table(mrt); 274 kfree(mrt);
281 } 275 }
282 fib_rules_unregister(net->ipv4.mr_rules_ops); 276 fib_rules_unregister(net->ipv4.mr_rules_ops);
283} 277}
@@ -305,7 +299,7 @@ static int __net_init ipmr_rules_init(struct net *net)
305 299
306static void __net_exit ipmr_rules_exit(struct net *net) 300static void __net_exit ipmr_rules_exit(struct net *net)
307{ 301{
308 ipmr_free_table(net->ipv4.mrt); 302 kfree(net->ipv4.mrt);
309} 303}
310#endif 304#endif
311 305
@@ -342,13 +336,6 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
342 return mrt; 336 return mrt;
343} 337}
344 338
345static void ipmr_free_table(struct mr_table *mrt)
346{
347 del_timer_sync(&mrt->ipmr_expire_timer);
348 mroute_clean_tables(mrt);
349 kfree(mrt);
350}
351
352/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 339/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
353 340
354static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) 341static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -537,8 +524,8 @@ failure:
537} 524}
538#endif 525#endif
539 526
540/** 527/*
541 * vif_delete - Delete a VIF entry 528 * Delete a VIF entry
542 * @notify: Set to 1, if the caller is a notifier_call 529 * @notify: Set to 1, if the caller is a notifier_call
543 */ 530 */
544 531
@@ -585,9 +572,6 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
585 in_dev = __in_dev_get_rtnl(dev); 572 in_dev = __in_dev_get_rtnl(dev);
586 if (in_dev) { 573 if (in_dev) {
587 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 574 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
588 inet_netconf_notify_devconf(dev_net(dev),
589 NETCONFA_MC_FORWARDING,
590 dev->ifindex, &in_dev->cnf);
591 ip_rt_multicast_event(in_dev); 575 ip_rt_multicast_event(in_dev);
592 } 576 }
593 577
@@ -632,7 +616,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
632 e->error = -ETIMEDOUT; 616 e->error = -ETIMEDOUT;
633 memset(&e->msg, 0, sizeof(e->msg)); 617 memset(&e->msg, 0, sizeof(e->msg));
634 618
635 rtnl_unicast(skb, net, NETLINK_CB(skb).portid); 619 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
636 } else { 620 } else {
637 kfree_skb(skb); 621 kfree_skb(skb);
638 } 622 }
@@ -671,7 +655,6 @@ static void ipmr_expire_process(unsigned long arg)
671 } 655 }
672 656
673 list_del(&c->list); 657 list_del(&c->list);
674 mroute_netlink_event(mrt, c, RTM_DELROUTE);
675 ipmr_destroy_unres(mrt, c); 658 ipmr_destroy_unres(mrt, c);
676 } 659 }
677 660
@@ -779,8 +762,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
779 return -EADDRNOTAVAIL; 762 return -EADDRNOTAVAIL;
780 } 763 }
781 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 764 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
782 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
783 &in_dev->cnf);
784 ip_rt_multicast_event(in_dev); 765 ip_rt_multicast_event(in_dev);
785 766
786 /* Fill in the VIF structures */ 767 /* Fill in the VIF structures */
@@ -879,7 +860,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
879 memset(&e->msg, 0, sizeof(e->msg)); 860 memset(&e->msg, 0, sizeof(e->msg));
880 } 861 }
881 862
882 rtnl_unicast(skb, net, NETLINK_CB(skb).portid); 863 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
883 } else { 864 } else {
884 ip_mr_forward(net, mrt, skb, c, 0); 865 ip_mr_forward(net, mrt, skb, c, 0);
885 } 866 }
@@ -968,7 +949,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
968 ret = sock_queue_rcv_skb(mroute_sk, skb); 949 ret = sock_queue_rcv_skb(mroute_sk, skb);
969 rcu_read_unlock(); 950 rcu_read_unlock();
970 if (ret < 0) { 951 if (ret < 0) {
971 net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); 952 if (net_ratelimit())
953 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
972 kfree_skb(skb); 954 kfree_skb(skb);
973 } 955 }
974 956
@@ -1029,7 +1011,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1029 1011
1030 atomic_inc(&mrt->cache_resolve_queue_len); 1012 atomic_inc(&mrt->cache_resolve_queue_len);
1031 list_add(&c->list, &mrt->mfc_unres_queue); 1013 list_add(&c->list, &mrt->mfc_unres_queue);
1032 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1033 1014
1034 if (atomic_read(&mrt->cache_resolve_queue_len) == 1) 1015 if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1035 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1016 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
@@ -1064,7 +1045,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1064 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1045 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1065 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1046 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1066 list_del_rcu(&c->list); 1047 list_del_rcu(&c->list);
1067 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1048
1068 ipmr_cache_free(c); 1049 ipmr_cache_free(c);
1069 return 0; 1050 return 0;
1070 } 1051 }
@@ -1099,7 +1080,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1099 if (!mrtsock) 1080 if (!mrtsock)
1100 c->mfc_flags |= MFC_STATIC; 1081 c->mfc_flags |= MFC_STATIC;
1101 write_unlock_bh(&mrt_lock); 1082 write_unlock_bh(&mrt_lock);
1102 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1103 return 0; 1083 return 0;
1104 } 1084 }
1105 1085
@@ -1142,7 +1122,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1142 ipmr_cache_resolve(net, mrt, uc, c); 1122 ipmr_cache_resolve(net, mrt, uc, c);
1143 ipmr_cache_free(uc); 1123 ipmr_cache_free(uc);
1144 } 1124 }
1145 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1146 return 0; 1125 return 0;
1147} 1126}
1148 1127
@@ -1171,7 +1150,6 @@ static void mroute_clean_tables(struct mr_table *mrt)
1171 if (c->mfc_flags & MFC_STATIC) 1150 if (c->mfc_flags & MFC_STATIC)
1172 continue; 1151 continue;
1173 list_del_rcu(&c->list); 1152 list_del_rcu(&c->list);
1174 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1175 ipmr_cache_free(c); 1153 ipmr_cache_free(c);
1176 } 1154 }
1177 } 1155 }
@@ -1180,7 +1158,6 @@ static void mroute_clean_tables(struct mr_table *mrt)
1180 spin_lock_bh(&mfc_unres_lock); 1158 spin_lock_bh(&mfc_unres_lock);
1181 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { 1159 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1182 list_del(&c->list); 1160 list_del(&c->list);
1183 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1184 ipmr_destroy_unres(mrt, c); 1161 ipmr_destroy_unres(mrt, c);
1185 } 1162 }
1186 spin_unlock_bh(&mfc_unres_lock); 1163 spin_unlock_bh(&mfc_unres_lock);
@@ -1199,10 +1176,7 @@ static void mrtsock_destruct(struct sock *sk)
1199 ipmr_for_each_table(mrt, net) { 1176 ipmr_for_each_table(mrt, net) {
1200 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1177 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1201 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1178 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1202 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, 1179 rcu_assign_pointer(mrt->mroute_sk, NULL);
1203 NETCONFA_IFINDEX_ALL,
1204 net->ipv4.devconf_all);
1205 RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1206 mroute_clean_tables(mrt); 1180 mroute_clean_tables(mrt);
1207 } 1181 }
1208 } 1182 }
@@ -1224,24 +1198,23 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1224 struct net *net = sock_net(sk); 1198 struct net *net = sock_net(sk);
1225 struct mr_table *mrt; 1199 struct mr_table *mrt;
1226 1200
1227 if (sk->sk_type != SOCK_RAW ||
1228 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1229 return -EOPNOTSUPP;
1230
1231 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1201 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1232 if (mrt == NULL) 1202 if (mrt == NULL)
1233 return -ENOENT; 1203 return -ENOENT;
1234 1204
1235 if (optname != MRT_INIT) { 1205 if (optname != MRT_INIT) {
1236 if (sk != rcu_access_pointer(mrt->mroute_sk) && 1206 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1237 !ns_capable(net->user_ns, CAP_NET_ADMIN)) 1207 !capable(CAP_NET_ADMIN))
1238 return -EACCES; 1208 return -EACCES;
1239 } 1209 }
1240 1210
1241 switch (optname) { 1211 switch (optname) {
1242 case MRT_INIT: 1212 case MRT_INIT:
1213 if (sk->sk_type != SOCK_RAW ||
1214 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1215 return -EOPNOTSUPP;
1243 if (optlen != sizeof(int)) 1216 if (optlen != sizeof(int))
1244 return -EINVAL; 1217 return -ENOPROTOOPT;
1245 1218
1246 rtnl_lock(); 1219 rtnl_lock();
1247 if (rtnl_dereference(mrt->mroute_sk)) { 1220 if (rtnl_dereference(mrt->mroute_sk)) {
@@ -1253,14 +1226,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1253 if (ret == 0) { 1226 if (ret == 0) {
1254 rcu_assign_pointer(mrt->mroute_sk, sk); 1227 rcu_assign_pointer(mrt->mroute_sk, sk);
1255 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1228 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1256 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
1257 NETCONFA_IFINDEX_ALL,
1258 net->ipv4.devconf_all);
1259 } 1229 }
1260 rtnl_unlock(); 1230 rtnl_unlock();
1261 return ret; 1231 return ret;
1262 case MRT_DONE: 1232 case MRT_DONE:
1263 if (sk != rcu_access_pointer(mrt->mroute_sk)) 1233 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1264 return -EACCES; 1234 return -EACCES;
1265 return ip_ra_control(sk, 0, NULL); 1235 return ip_ra_control(sk, 0, NULL);
1266 case MRT_ADD_VIF: 1236 case MRT_ADD_VIF:
@@ -1305,11 +1275,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1305 case MRT_ASSERT: 1275 case MRT_ASSERT:
1306 { 1276 {
1307 int v; 1277 int v;
1308 if (optlen != sizeof(v))
1309 return -EINVAL;
1310 if (get_user(v, (int __user *)optval)) 1278 if (get_user(v, (int __user *)optval))
1311 return -EFAULT; 1279 return -EFAULT;
1312 mrt->mroute_do_assert = v; 1280 mrt->mroute_do_assert = (v) ? 1 : 0;
1313 return 0; 1281 return 0;
1314 } 1282 }
1315#ifdef CONFIG_IP_PIMSM 1283#ifdef CONFIG_IP_PIMSM
@@ -1317,11 +1285,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1317 { 1285 {
1318 int v; 1286 int v;
1319 1287
1320 if (optlen != sizeof(v))
1321 return -EINVAL;
1322 if (get_user(v, (int __user *)optval)) 1288 if (get_user(v, (int __user *)optval))
1323 return -EFAULT; 1289 return -EFAULT;
1324 v = !!v; 1290 v = (v) ? 1 : 0;
1325 1291
1326 rtnl_lock(); 1292 rtnl_lock();
1327 ret = 0; 1293 ret = 0;
@@ -1343,10 +1309,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1343 if (get_user(v, (u32 __user *)optval)) 1309 if (get_user(v, (u32 __user *)optval))
1344 return -EFAULT; 1310 return -EFAULT;
1345 1311
1346 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
1347 if (v != RT_TABLE_DEFAULT && v >= 1000000000)
1348 return -EINVAL;
1349
1350 rtnl_lock(); 1312 rtnl_lock();
1351 ret = 0; 1313 ret = 0;
1352 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1314 if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1354,8 +1316,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1354 } else { 1316 } else {
1355 if (!ipmr_new_table(net, v)) 1317 if (!ipmr_new_table(net, v))
1356 ret = -ENOMEM; 1318 ret = -ENOMEM;
1357 else 1319 raw_sk(sk)->ipmr_table = v;
1358 raw_sk(sk)->ipmr_table = v;
1359 } 1320 }
1360 rtnl_unlock(); 1321 rtnl_unlock();
1361 return ret; 1322 return ret;
@@ -1381,10 +1342,6 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1381 struct net *net = sock_net(sk); 1342 struct net *net = sock_net(sk);
1382 struct mr_table *mrt; 1343 struct mr_table *mrt;
1383 1344
1384 if (sk->sk_type != SOCK_RAW ||
1385 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1386 return -EOPNOTSUPP;
1387
1388 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1345 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1389 if (mrt == NULL) 1346 if (mrt == NULL)
1390 return -ENOENT; 1347 return -ENOENT;
@@ -1562,6 +1519,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1562 struct mr_table *mrt; 1519 struct mr_table *mrt;
1563 struct vif_device *v; 1520 struct vif_device *v;
1564 int ct; 1521 int ct;
1522 LIST_HEAD(list);
1565 1523
1566 if (event != NETDEV_UNREGISTER) 1524 if (event != NETDEV_UNREGISTER)
1567 return NOTIFY_DONE; 1525 return NOTIFY_DONE;
@@ -1570,9 +1528,10 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1570 v = &mrt->vif_table[0]; 1528 v = &mrt->vif_table[0];
1571 for (ct = 0; ct < mrt->maxvif; ct++, v++) { 1529 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1572 if (v->dev == dev) 1530 if (v->dev == dev)
1573 vif_delete(mrt, ct, 1, NULL); 1531 vif_delete(mrt, ct, 1, &list);
1574 } 1532 }
1575 } 1533 }
1534 unregister_netdevice_many(&list);
1576 return NOTIFY_DONE; 1535 return NOTIFY_DONE;
1577} 1536}
1578 1537
@@ -1618,7 +1577,6 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1618 struct ip_options *opt = &(IPCB(skb)->opt); 1577 struct ip_options *opt = &(IPCB(skb)->opt);
1619 1578
1620 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1579 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1621 IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
1622 1580
1623 if (unlikely(opt->optlen)) 1581 if (unlikely(opt->optlen))
1624 ip_forward_options(skb); 1582 ip_forward_options(skb);
@@ -1839,12 +1797,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1839 .daddr = iph->daddr, 1797 .daddr = iph->daddr,
1840 .saddr = iph->saddr, 1798 .saddr = iph->saddr,
1841 .flowi4_tos = RT_TOS(iph->tos), 1799 .flowi4_tos = RT_TOS(iph->tos),
1842 .flowi4_oif = (rt_is_output_route(rt) ? 1800 .flowi4_oif = rt->rt_oif,
1843 skb->dev->ifindex : 0), 1801 .flowi4_iif = rt->rt_iif,
1844 .flowi4_iif = (rt_is_output_route(rt) ? 1802 .flowi4_mark = rt->rt_mark,
1845 LOOPBACK_IFINDEX :
1846 skb->dev->ifindex),
1847 .flowi4_mark = skb->mark,
1848 }; 1803 };
1849 struct mr_table *mrt; 1804 struct mr_table *mrt;
1850 int err; 1805 int err;
@@ -2053,44 +2008,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2053{ 2008{
2054 int ct; 2009 int ct;
2055 struct rtnexthop *nhp; 2010 struct rtnexthop *nhp;
2056 struct nlattr *mp_attr; 2011 u8 *b = skb_tail_pointer(skb);
2057 struct rta_mfc_stats mfcs; 2012 struct rtattr *mp_head;
2058 2013
2059 /* If cache is unresolved, don't try to parse IIF and OIF */ 2014 /* If cache is unresolved, don't try to parse IIF and OIF */
2060 if (c->mfc_parent >= MAXVIFS) 2015 if (c->mfc_parent >= MAXVIFS)
2061 return -ENOENT; 2016 return -ENOENT;
2062 2017
2063 if (VIF_EXISTS(mrt, c->mfc_parent) && 2018 if (VIF_EXISTS(mrt, c->mfc_parent))
2064 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) 2019 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
2065 return -EMSGSIZE;
2066 2020
2067 if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) 2021 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
2068 return -EMSGSIZE;
2069 2022
2070 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 2023 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2071 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { 2024 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
2072 if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { 2025 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
2073 nla_nest_cancel(skb, mp_attr); 2026 goto rtattr_failure;
2074 return -EMSGSIZE; 2027 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
2075 }
2076
2077 nhp->rtnh_flags = 0; 2028 nhp->rtnh_flags = 0;
2078 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 2029 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2079 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; 2030 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
2080 nhp->rtnh_len = sizeof(*nhp); 2031 nhp->rtnh_len = sizeof(*nhp);
2081 } 2032 }
2082 } 2033 }
2083 2034 mp_head->rta_type = RTA_MULTIPATH;
2084 nla_nest_end(skb, mp_attr); 2035 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
2085
2086 mfcs.mfcs_packets = c->mfc_un.res.pkt;
2087 mfcs.mfcs_bytes = c->mfc_un.res.bytes;
2088 mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
2089 if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
2090 return -EMSGSIZE;
2091
2092 rtm->rtm_type = RTN_MULTICAST; 2036 rtm->rtm_type = RTN_MULTICAST;
2093 return 1; 2037 return 1;
2038
2039rtattr_failure:
2040 nlmsg_trim(skb, b);
2041 return -EMSGSIZE;
2094} 2042}
2095 2043
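The __ipmr_fill_mroute() change above swaps nla_nest_start()/nla_nest_end() and nla_put_*() for hand-rolled RTA_PUT()/rtattr arithmetic; both boil down to the same mechanism: reserve a TLV header for the RTA_MULTIPATH container, append the rtnexthop children, then patch the container length. Below is a portable sketch of that mechanism using a simplified 4-byte-aligned TLV, not the kernel's struct nlattr helpers; the attribute type numbers and payloads are arbitrary.

/* Sketch of building a nested TLV attribute: header first, children next,
 * outer length patched at the end. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TLV_ALIGN(len) (((len) + 3U) & ~3U)

struct tlv {			/* stand-in for struct nlattr / struct rtattr */
	uint16_t len;		/* header + payload, unaligned */
	uint16_t type;
};

static unsigned char buf[256];
static size_t used;

static size_t nest_start(uint16_t type)
{
	size_t off = used;
	struct tlv hdr = { sizeof(struct tlv), type };

	memcpy(buf + used, &hdr, sizeof(hdr));
	used += TLV_ALIGN(sizeof(hdr));
	return off;		/* remembered so nest_end() can patch the length */
}

static void put_u32(uint16_t type, uint32_t val)
{
	struct tlv hdr = { sizeof(struct tlv) + sizeof(val), type };

	memcpy(buf + used, &hdr, sizeof(hdr));
	memcpy(buf + used + sizeof(hdr), &val, sizeof(val));
	used += TLV_ALIGN(hdr.len);
}

static void nest_end(size_t nest_off)
{
	uint16_t len = (uint16_t)(used - nest_off);

	memcpy(buf + nest_off, &len, sizeof(len));	/* patch outer length */
}

int main(void)
{
	size_t mp = nest_start(9);	/* 9 stands in for RTA_MULTIPATH */
	uint16_t outer_len;

	put_u32(1, 3);			/* illustrative nexthop payloads */
	put_u32(1, 7);
	nest_end(mp);

	memcpy(&outer_len, buf + mp, sizeof(outer_len));
	printf("outer attr length %u, buffer used %zu\n",
	       (unsigned)outer_len, used);
	return 0;
}

nla_nest_end() performs exactly this length patch against the nlattr returned by nla_nest_start(), while the older code recomputes it from skb_tail_pointer() by hand.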
2096int ipmr_get_route(struct net *net, struct sk_buff *skb, 2044int ipmr_get_route(struct net *net, struct sk_buff *skb,
@@ -2158,13 +2106,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2158} 2106}
2159 2107
2160static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 2108static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2161 u32 portid, u32 seq, struct mfc_cache *c, int cmd) 2109 u32 pid, u32 seq, struct mfc_cache *c)
2162{ 2110{
2163 struct nlmsghdr *nlh; 2111 struct nlmsghdr *nlh;
2164 struct rtmsg *rtm; 2112 struct rtmsg *rtm;
2165 int err;
2166 2113
2167 nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); 2114 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2168 if (nlh == NULL) 2115 if (nlh == NULL)
2169 return -EMSGSIZE; 2116 return -EMSGSIZE;
2170 2117
@@ -2174,22 +2121,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2174 rtm->rtm_src_len = 32; 2121 rtm->rtm_src_len = 32;
2175 rtm->rtm_tos = 0; 2122 rtm->rtm_tos = 0;
2176 rtm->rtm_table = mrt->id; 2123 rtm->rtm_table = mrt->id;
2177 if (nla_put_u32(skb, RTA_TABLE, mrt->id)) 2124 NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2178 goto nla_put_failure;
2179 rtm->rtm_type = RTN_MULTICAST; 2125 rtm->rtm_type = RTN_MULTICAST;
2180 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 2126 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2181 if (c->mfc_flags & MFC_STATIC) 2127 rtm->rtm_protocol = RTPROT_UNSPEC;
2182 rtm->rtm_protocol = RTPROT_STATIC;
2183 else
2184 rtm->rtm_protocol = RTPROT_MROUTED;
2185 rtm->rtm_flags = 0; 2128 rtm->rtm_flags = 0;
2186 2129
2187 if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || 2130 NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2188 nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) 2131 NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2189 goto nla_put_failure; 2132
2190 err = __ipmr_fill_mroute(mrt, skb, c, rtm); 2133 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2191 /* do not break the dump if cache is unresolved */
2192 if (err < 0 && err != -ENOENT)
2193 goto nla_put_failure; 2134 goto nla_put_failure;
2194 2135
2195 return nlmsg_end(skb, nlh); 2136 return nlmsg_end(skb, nlh);
@@ -2199,52 +2140,6 @@ nla_put_failure:
2199 return -EMSGSIZE; 2140 return -EMSGSIZE;
2200} 2141}
2201 2142
2202static size_t mroute_msgsize(bool unresolved, int maxvif)
2203{
2204 size_t len =
2205 NLMSG_ALIGN(sizeof(struct rtmsg))
2206 + nla_total_size(4) /* RTA_TABLE */
2207 + nla_total_size(4) /* RTA_SRC */
2208 + nla_total_size(4) /* RTA_DST */
2209 ;
2210
2211 if (!unresolved)
2212 len = len
2213 + nla_total_size(4) /* RTA_IIF */
2214 + nla_total_size(0) /* RTA_MULTIPATH */
2215 + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
2216 /* RTA_MFC_STATS */
2217 + nla_total_size(sizeof(struct rta_mfc_stats))
2218 ;
2219
2220 return len;
2221}
2222
2223static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
2224 int cmd)
2225{
2226 struct net *net = read_pnet(&mrt->net);
2227 struct sk_buff *skb;
2228 int err = -ENOBUFS;
2229
2230 skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
2231 GFP_ATOMIC);
2232 if (skb == NULL)
2233 goto errout;
2234
2235 err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd);
2236 if (err < 0)
2237 goto errout;
2238
2239 rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
2240 return;
2241
2242errout:
2243 kfree_skb(skb);
2244 if (err < 0)
2245 rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
2246}
2247
2248static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) 2143static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2249{ 2144{
2250 struct net *net = sock_net(skb->sk); 2145 struct net *net = sock_net(skb->sk);
@@ -2269,31 +2164,15 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2269 if (e < s_e) 2164 if (e < s_e)
2270 goto next_entry; 2165 goto next_entry;
2271 if (ipmr_fill_mroute(mrt, skb, 2166 if (ipmr_fill_mroute(mrt, skb,
2272 NETLINK_CB(cb->skb).portid, 2167 NETLINK_CB(cb->skb).pid,
2273 cb->nlh->nlmsg_seq, 2168 cb->nlh->nlmsg_seq,
2274 mfc, RTM_NEWROUTE) < 0) 2169 mfc) < 0)
2275 goto done; 2170 goto done;
2276next_entry: 2171next_entry:
2277 e++; 2172 e++;
2278 } 2173 }
2279 e = s_e = 0; 2174 e = s_e = 0;
2280 } 2175 }
2281 spin_lock_bh(&mfc_unres_lock);
2282 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
2283 if (e < s_e)
2284 goto next_entry2;
2285 if (ipmr_fill_mroute(mrt, skb,
2286 NETLINK_CB(cb->skb).portid,
2287 cb->nlh->nlmsg_seq,
2288 mfc, RTM_NEWROUTE) < 0) {
2289 spin_unlock_bh(&mfc_unres_lock);
2290 goto done;
2291 }
2292next_entry2:
2293 e++;
2294 }
2295 spin_unlock_bh(&mfc_unres_lock);
2296 e = s_e = 0;
2297 s_h = 0; 2176 s_h = 0;
2298next_table: 2177next_table:
2299 t++; 2178 t++;
@@ -2660,7 +2539,7 @@ int __init ip_mr_init(void)
2660 goto reg_notif_fail; 2539 goto reg_notif_fail;
2661#ifdef CONFIG_IP_PIMSM_V2 2540#ifdef CONFIG_IP_PIMSM_V2
2662 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { 2541 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2663 pr_err("%s: can't add PIM protocol\n", __func__); 2542 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2664 err = -EAGAIN; 2543 err = -EAGAIN;
2665 goto add_proto_fail; 2544 goto add_proto_fail;
2666 } 2545 }
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c0cf63dd92..929b27bdeb7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -5,14 +5,13 @@
5#include <linux/ip.h> 5#include <linux/ip.h>
6#include <linux/skbuff.h> 6#include <linux/skbuff.h>
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/export.h>
9#include <net/route.h> 8#include <net/route.h>
10#include <net/xfrm.h> 9#include <net/xfrm.h>
11#include <net/ip.h> 10#include <net/ip.h>
12#include <net/netfilter/nf_queue.h> 11#include <net/netfilter/nf_queue.h>
13 12
14/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 13/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
15int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) 14int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16{ 15{
17 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
18 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
@@ -64,14 +63,50 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
64 /* Change in oif may mean change in hh_len. */ 63 /* Change in oif may mean change in hh_len. */
65 hh_len = skb_dst(skb)->dev->hard_header_len; 64 hh_len = skb_dst(skb)->dev->hard_header_len;
66 if (skb_headroom(skb) < hh_len && 65 if (skb_headroom(skb) < hh_len &&
67 pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 66 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
68 0, GFP_ATOMIC))
69 return -1; 67 return -1;
70 68
71 return 0; 69 return 0;
72} 70}
73EXPORT_SYMBOL(ip_route_me_harder); 71EXPORT_SYMBOL(ip_route_me_harder);
74 72
73#ifdef CONFIG_XFRM
74int ip_xfrm_me_harder(struct sk_buff *skb)
75{
76 struct flowi fl;
77 unsigned int hh_len;
78 struct dst_entry *dst;
79
80 if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
81 return 0;
82 if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
83 return -1;
84
85 dst = skb_dst(skb);
86 if (dst->xfrm)
87 dst = ((struct xfrm_dst *)dst)->route;
88 dst_hold(dst);
89
90 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
91 if (IS_ERR(dst))
92 return -1;
93
94 skb_dst_drop(skb);
95 skb_dst_set(skb, dst);
96
97 /* Change in oif may mean change in hh_len. */
98 hh_len = skb_dst(skb)->dev->hard_header_len;
99 if (skb_headroom(skb) < hh_len &&
100 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
101 return -1;
102 return 0;
103}
104EXPORT_SYMBOL(ip_xfrm_me_harder);
105#endif
106
107void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
108EXPORT_SYMBOL(ip_nat_decode_session);
109
75/* 110/*
76 * Extra routing may be needed on local out, as the QUEUE target never 111 * Extra routing may be needed on local out, as the QUEUE target never
77 * returns control to the table. 112 * returns control to the table.
@@ -188,15 +223,25 @@ static const struct nf_afinfo nf_ip_afinfo = {
188 .route_key_size = sizeof(struct ip_rt_info), 223 .route_key_size = sizeof(struct ip_rt_info),
189}; 224};
190 225
191static int __init ipv4_netfilter_init(void) 226static int ipv4_netfilter_init(void)
192{ 227{
193 return nf_register_afinfo(&nf_ip_afinfo); 228 return nf_register_afinfo(&nf_ip_afinfo);
194} 229}
195 230
196static void __exit ipv4_netfilter_fini(void) 231static void ipv4_netfilter_fini(void)
197{ 232{
198 nf_unregister_afinfo(&nf_ip_afinfo); 233 nf_unregister_afinfo(&nf_ip_afinfo);
199} 234}
200 235
201module_init(ipv4_netfilter_init); 236module_init(ipv4_netfilter_init);
202module_exit(ipv4_netfilter_fini); 237module_exit(ipv4_netfilter_fini);
238
239#ifdef CONFIG_SYSCTL
240struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
241 { .procname = "net", },
242 { .procname = "ipv4", },
243 { .procname = "netfilter", },
244 { }
245};
246EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
247#endif /* CONFIG_SYSCTL */
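Both ip_route_me_harder() and the restored ip_xfrm_me_harder() end the same way: after rerouting, the skb may now sit behind a device with a larger hard_header_len, so the code grows the headroom via pskb_expand_head() when skb_headroom() falls short. Below is a stand-alone sketch of that "reserve more front room if the new device needs it" idea on a plain buffer; pskb_expand_head() itself also handles shared data, fragments and cloned skbs, none of which is modelled here, and all names are illustrative.

/* Sketch of growing the headroom of a packet buffer. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt {
	unsigned char *head;	/* start of allocation */
	unsigned char *data;	/* start of payload    */
	size_t len;		/* payload length      */
};

static size_t headroom(const struct pkt *p)
{
	return (size_t)(p->data - p->head);
}

/* Returns 0 on success, -1 on allocation failure. */
static int reserve_headroom(struct pkt *p, size_t hh_len)
{
	size_t off = headroom(p), extra;
	unsigned char *nhead;

	if (off >= hh_len)
		return 0;			/* already enough room */

	extra = hh_len - off;
	nhead = realloc(p->head, off + extra + p->len);
	if (!nhead)
		return -1;
	/* shift the payload back so the front gains 'extra' bytes */
	memmove(nhead + off + extra, nhead + off, p->len);
	p->head = nhead;
	p->data = nhead + off + extra;
	return 0;
}

int main(void)
{
	struct pkt p;

	p.head = malloc(2 + 20);
	if (!p.head)
		return 1;
	p.data = p.head + 2;			/* only 2 bytes of headroom */
	p.len = 20;
	memset(p.data, 0xab, p.len);

	printf("headroom before: %zu\n", headroom(&p));
	if (reserve_headroom(&p, 14) == 0)	/* e.g. an Ethernet header */
		printf("headroom after:  %zu\n", headroom(&p));
	free(p.head);
	return 0;
}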
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d8d6f2a5bf1..73b4e91a87e 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4
27 27
28config NF_CONNTRACK_PROC_COMPAT 28config NF_CONNTRACK_PROC_COMPAT
29 bool "proc/sysctl compatibility with old connection tracking" 29 bool "proc/sysctl compatibility with old connection tracking"
30 depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4 30 depends on NF_CONNTRACK_IPV4
31 default y 31 default y
32 help 32 help
33 This option enables /proc and sysctl compatibility with the old 33 This option enables /proc and sysctl compatibility with the old
@@ -76,21 +76,11 @@ config IP_NF_MATCH_AH
76config IP_NF_MATCH_ECN 76config IP_NF_MATCH_ECN
77 tristate '"ecn" match support' 77 tristate '"ecn" match support'
78 depends on NETFILTER_ADVANCED 78 depends on NETFILTER_ADVANCED
79 select NETFILTER_XT_MATCH_ECN 79 help
80 ---help--- 80 This option adds a `ECN' match, which allows you to match against
81 This is a backwards-compat option for the user's convenience 81 the IPv4 and TCP header ECN fields.
82 (e.g. when running oldconfig). It selects
83 CONFIG_NETFILTER_XT_MATCH_ECN.
84
85config IP_NF_MATCH_RPFILTER
86 tristate '"rpfilter" reverse path filter match support'
87 depends on NETFILTER_ADVANCED
88 ---help---
89 This option allows you to match packets whose replies would
90 go out via the interface the packet came in.
91 82
92 To compile it as a module, choose M here. If unsure, say N. 83 To compile it as a module, choose M here. If unsure, say N.
93 The module will be called ipt_rpfilter.
94 84
95config IP_NF_MATCH_TTL 85config IP_NF_MATCH_TTL
96 tristate '"ttl" match support' 86 tristate '"ttl" match support'
@@ -123,6 +113,27 @@ config IP_NF_TARGET_REJECT
123 113
124 To compile it as a module, choose M here. If unsure, say N. 114 To compile it as a module, choose M here. If unsure, say N.
125 115
116config IP_NF_TARGET_REJECT_SKERR
117 bool "Force socket error when rejecting with icmp*"
118 depends on IP_NF_TARGET_REJECT
119 default n
120 help
121 This option enables turning a "--reject-with icmp*" into a matching
122 socket error also.
123 The REJECT target normally allows sending an ICMP message. But it
124 leaves the local socket unaware of any ingress rejects.
125
126 If unsure, say N.
127
128config IP_NF_TARGET_LOG
129 tristate "LOG target support"
130 default m if NETFILTER_ADVANCED=n
131 help
132 This option adds a `LOG' target, which allows you to create rules in
133 any iptables table which records the packet header to the syslog.
134
135 To compile it as a module, choose M here. If unsure, say N.
136
126config IP_NF_TARGET_ULOG 137config IP_NF_TARGET_ULOG
127 tristate "ULOG target support" 138 tristate "ULOG target support"
128 default m if NETFILTER_ADVANCED=n 139 default m if NETFILTER_ADVANCED=n
@@ -143,22 +154,25 @@ config IP_NF_TARGET_ULOG
143 To compile it as a module, choose M here. If unsure, say N. 154 To compile it as a module, choose M here. If unsure, say N.
144 155
145# NAT + specific targets: nf_conntrack 156# NAT + specific targets: nf_conntrack
146config NF_NAT_IPV4 157config NF_NAT
147 tristate "IPv4 NAT" 158 tristate "Full NAT"
148 depends on NF_CONNTRACK_IPV4 159 depends on NF_CONNTRACK_IPV4
149 default m if NETFILTER_ADVANCED=n 160 default m if NETFILTER_ADVANCED=n
150 select NF_NAT
151 help 161 help
152 The IPv4 NAT option allows masquerading, port forwarding and other 162 The Full NAT option allows masquerading, port forwarding and other
153 forms of full Network Address Port Translation. It is controlled by 163 forms of full Network Address Port Translation. It is controlled by
154 the `nat' table in iptables: see the man page for iptables(8). 164 the `nat' table in iptables: see the man page for iptables(8).
155 165
156 To compile it as a module, choose M here. If unsure, say N. 166 To compile it as a module, choose M here. If unsure, say N.
157 167
158if NF_NAT_IPV4 168config NF_NAT_NEEDED
169 bool
170 depends on NF_NAT
171 default y
159 172
160config IP_NF_TARGET_MASQUERADE 173config IP_NF_TARGET_MASQUERADE
161 tristate "MASQUERADE target support" 174 tristate "MASQUERADE target support"
175 depends on NF_NAT
162 default m if NETFILTER_ADVANCED=n 176 default m if NETFILTER_ADVANCED=n
163 help 177 help
164 Masquerading is a special case of NAT: all outgoing connections are 178 Masquerading is a special case of NAT: all outgoing connections are
@@ -171,27 +185,30 @@ config IP_NF_TARGET_MASQUERADE
171 185
172config IP_NF_TARGET_NETMAP 186config IP_NF_TARGET_NETMAP
173 tristate "NETMAP target support" 187 tristate "NETMAP target support"
188 depends on NF_NAT
174 depends on NETFILTER_ADVANCED 189 depends on NETFILTER_ADVANCED
175 select NETFILTER_XT_TARGET_NETMAP 190 help
176 ---help--- 191 NETMAP is an implementation of static 1:1 NAT mapping of network
177 This is a backwards-compat option for the user's convenience 192 addresses. It maps the network address part, while keeping the host
178 (e.g. when running oldconfig). It selects 193 address part intact.
179 CONFIG_NETFILTER_XT_TARGET_NETMAP. 194
195 To compile it as a module, choose M here. If unsure, say N.
180 196
181config IP_NF_TARGET_REDIRECT 197config IP_NF_TARGET_REDIRECT
182 tristate "REDIRECT target support" 198 tristate "REDIRECT target support"
199 depends on NF_NAT
183 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
184 select NETFILTER_XT_TARGET_REDIRECT 201 help
185 ---help--- 202 REDIRECT is a special case of NAT: all incoming connections are
186 This is a backwards-compat option for the user's convenience 203 mapped onto the incoming interface's address, causing the packets to
187 (e.g. when running oldconfig). It selects 204 come to the local machine instead of passing through. This is
188 CONFIG_NETFILTER_XT_TARGET_REDIRECT. 205 useful for transparent proxies.
189 206
190endif 207 To compile it as a module, choose M here. If unsure, say N.
191 208
192config NF_NAT_SNMP_BASIC 209config NF_NAT_SNMP_BASIC
193 tristate "Basic SNMP-ALG support" 210 tristate "Basic SNMP-ALG support"
194 depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 211 depends on NF_CONNTRACK_SNMP && NF_NAT
195 depends on NETFILTER_ADVANCED 212 depends on NETFILTER_ADVANCED
196 default NF_NAT && NF_CONNTRACK_SNMP 213 default NF_NAT && NF_CONNTRACK_SNMP
197 ---help--- 214 ---help---
@@ -213,21 +230,61 @@ config NF_NAT_SNMP_BASIC
213# <expr> '&&' <expr> (6) 230# <expr> '&&' <expr> (6)
214# 231#
215# (6) Returns the result of min(/expr/, /expr/). 232# (6) Returns the result of min(/expr/, /expr/).
233config NF_NAT_PROTO_DCCP
234 tristate
235 depends on NF_NAT && NF_CT_PROTO_DCCP
236 default NF_NAT && NF_CT_PROTO_DCCP
216 237
217config NF_NAT_PROTO_GRE 238config NF_NAT_PROTO_GRE
218 tristate 239 tristate
219 depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE 240 depends on NF_NAT && NF_CT_PROTO_GRE
241
242config NF_NAT_PROTO_UDPLITE
243 tristate
244 depends on NF_NAT && NF_CT_PROTO_UDPLITE
245 default NF_NAT && NF_CT_PROTO_UDPLITE
246
247config NF_NAT_PROTO_SCTP
248 tristate
249 default NF_NAT && NF_CT_PROTO_SCTP
250 depends on NF_NAT && NF_CT_PROTO_SCTP
251 select LIBCRC32C
252
253config NF_NAT_FTP
254 tristate
255 depends on NF_CONNTRACK && NF_NAT
256 default NF_NAT && NF_CONNTRACK_FTP
257
258config NF_NAT_IRC
259 tristate
260 depends on NF_CONNTRACK && NF_NAT
261 default NF_NAT && NF_CONNTRACK_IRC
262
263config NF_NAT_TFTP
264 tristate
265 depends on NF_CONNTRACK && NF_NAT
266 default NF_NAT && NF_CONNTRACK_TFTP
267
268config NF_NAT_AMANDA
269 tristate
270 depends on NF_CONNTRACK && NF_NAT
271 default NF_NAT && NF_CONNTRACK_AMANDA
220 272
221config NF_NAT_PPTP 273config NF_NAT_PPTP
222 tristate 274 tristate
223 depends on NF_CONNTRACK && NF_NAT_IPV4 275 depends on NF_CONNTRACK && NF_NAT
224 default NF_NAT_IPV4 && NF_CONNTRACK_PPTP 276 default NF_NAT && NF_CONNTRACK_PPTP
225 select NF_NAT_PROTO_GRE 277 select NF_NAT_PROTO_GRE
226 278
227config NF_NAT_H323 279config NF_NAT_H323
228 tristate 280 tristate
229 depends on NF_CONNTRACK && NF_NAT_IPV4 281 depends on NF_CONNTRACK && NF_NAT
230 default NF_NAT_IPV4 && NF_CONNTRACK_H323 282 default NF_NAT && NF_CONNTRACK_H323
283
284config NF_NAT_SIP
285 tristate
286 depends on NF_CONNTRACK && NF_NAT
287 default NF_NAT && NF_CONNTRACK_SIP
231 288
232# mangle + specific targets 289# mangle + specific targets
233config IP_NF_MANGLE 290config IP_NF_MANGLE
@@ -280,6 +337,7 @@ config IP_NF_TARGET_TTL
280# raw + specific targets 337# raw + specific targets
281config IP_NF_RAW 338config IP_NF_RAW
282 tristate 'raw table support (required for NOTRACK/TRACE)' 339 tristate 'raw table support (required for NOTRACK/TRACE)'
340 depends on NETFILTER_ADVANCED
283 help 341 help
284 This option adds a `raw' table to iptables. This table is the very 342 This option adds a `raw' table to iptables. This table is the very
285 first in the netfilter framework and hooks in at the PREROUTING 343 first in the netfilter framework and hooks in at the PREROUTING
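The dependency-expression comment in the NF_NAT_SNMP_BASIC hunk above notes that Kconfig's `&&' evaluates to min() of its operands over the tristate values n < m < y. A minimal userspace C sketch of that rule (the enum and function names are chosen here for illustration, not taken from the kernel):

#include <stdio.h>

/* Kconfig tristate ordering: n < m < y. */
enum tristate { TRI_N = 0, TRI_M = 1, TRI_Y = 2 };

/* "expr && expr" evaluates to min(expr, expr), per the comment above. */
static enum tristate tri_and(enum tristate a, enum tristate b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* e.g. "default NF_NAT && NF_CONNTRACK_FTP" with NF_NAT=m, FTP=y */
        printf("m && y -> %d (m)\n", tri_and(TRI_M, TRI_Y));
        printf("y && y -> %d (y)\n", tri_and(TRI_Y, TRI_Y));
        printf("n && y -> %d (n)\n", tri_and(TRI_N, TRI_Y));
        return 0;
}

This is why a helper such as NF_NAT_FTP defaults to m whenever either NF_NAT or NF_CONNTRACK_FTP is built as a module.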
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc..dca2082ec68 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,22 +10,32 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15
13# connection tracking 16# connection tracking
14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
15 18
16nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o 19obj-$(CONFIG_NF_NAT) += nf_nat.o
17obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
18 20
19# defrag 21# defrag
20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
21 23
22# NAT helpers (nf_conntrack) 24# NAT helpers (nf_conntrack)
25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 29obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
30obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
25obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 31obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
32obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
26 33
27# NAT protocols (nf_nat) 34# NAT protocols (nf_nat)
35obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
28obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
37obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
38obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
29 39
30# generic IP tables 40# generic IP tables
31obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 41obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -33,18 +43,21 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
33# the three instances of ip_tables 43# the three instances of ip_tables
34obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 44obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
35obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 45obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
36obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o 46obj-$(CONFIG_NF_NAT) += iptable_nat.o
37obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 47obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
38obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
39 49
40# matches 50# matches
41obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
42obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
43 53
44# targets 54# targets
45obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 56obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
57obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 58obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
59obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
60obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 61obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 62obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
50 63
@@ -54,3 +67,6 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
54 67
55# just filtering instance of ARP tables for now 68# just filtering instance of ARP tables for now
56obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o 69obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
70
71obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
72
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3ea4127404d..fd7a3f68917 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -221,8 +221,9 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
221static unsigned int 221static unsigned int
222arpt_error(struct sk_buff *skb, const struct xt_action_param *par) 222arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
223{ 223{
224 net_err_ratelimited("arp_tables: error: '%s'\n", 224 if (net_ratelimit())
225 (const char *)par->targinfo); 225 pr_err("arp_tables: error: '%s'\n",
226 (const char *)par->targinfo);
226 227
227 return NF_DROP; 228 return NF_DROP;
228} 229}
@@ -302,7 +303,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
302 if (v < 0) { 303 if (v < 0) {
303 /* Pop from stack? */ 304 /* Pop from stack? */
304 if (v != XT_RETURN) { 305 if (v != XT_RETURN) {
305 verdict = (unsigned int)(-v) - 1; 306 verdict = (unsigned)(-v) - 1;
306 break; 307 break;
307 } 308 }
308 e = back; 309 e = back;
@@ -1533,7 +1534,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
1533{ 1534{
1534 int ret; 1535 int ret;
1535 1536
1536 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1537 if (!capable(CAP_NET_ADMIN))
1537 return -EPERM; 1538 return -EPERM;
1538 1539
1539 switch (cmd) { 1540 switch (cmd) {
@@ -1677,7 +1678,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
1677{ 1678{
1678 int ret; 1679 int ret;
1679 1680
1680 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1681 if (!capable(CAP_NET_ADMIN))
1681 return -EPERM; 1682 return -EPERM;
1682 1683
1683 switch (cmd) { 1684 switch (cmd) {
@@ -1698,7 +1699,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
1698{ 1699{
1699 int ret; 1700 int ret;
1700 1701
1701 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1702 if (!capable(CAP_NET_ADMIN))
1702 return -EPERM; 1703 return -EPERM;
1703 1704
1704 switch (cmd) { 1705 switch (cmd) {
@@ -1722,7 +1723,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1722{ 1723{
1723 int ret; 1724 int ret;
1724 1725
1725 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1726 if (!capable(CAP_NET_ADMIN))
1726 return -EPERM; 1727 return -EPERM;
1727 1728
1728 switch (cmd) { 1729 switch (cmd) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 17c5e06da66..24e556e83a3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -153,7 +153,8 @@ ip_checkentry(const struct ipt_ip *ip)
153static unsigned int 153static unsigned int
154ipt_error(struct sk_buff *skb, const struct xt_action_param *par) 154ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
155{ 155{
156 net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); 156 if (net_ratelimit())
157 pr_info("error: `%s'\n", (const char *)par->targinfo);
157 158
158 return NF_DROP; 159 return NF_DROP;
159} 160}
@@ -376,7 +377,7 @@ ipt_do_table(struct sk_buff *skb,
376 if (v < 0) { 377 if (v < 0) {
377 /* Pop from stack? */ 378 /* Pop from stack? */
378 if (v != XT_RETURN) { 379 if (v != XT_RETURN) {
379 verdict = (unsigned int)(-v) - 1; 380 verdict = (unsigned)(-v) - 1;
380 break; 381 break;
381 } 382 }
382 if (*stackptr <= origptr) { 383 if (*stackptr <= origptr) {
@@ -1846,7 +1847,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1846{ 1847{
1847 int ret; 1848 int ret;
1848 1849
1849 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1850 if (!capable(CAP_NET_ADMIN))
1850 return -EPERM; 1851 return -EPERM;
1851 1852
1852 switch (cmd) { 1853 switch (cmd) {
@@ -1961,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1961{ 1962{
1962 int ret; 1963 int ret;
1963 1964
1964 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1965 if (!capable(CAP_NET_ADMIN))
1965 return -EPERM; 1966 return -EPERM;
1966 1967
1967 switch (cmd) { 1968 switch (cmd) {
@@ -1983,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1983{ 1984{
1984 int ret; 1985 int ret;
1985 1986
1986 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1987 if (!capable(CAP_NET_ADMIN))
1987 return -EPERM; 1988 return -EPERM;
1988 1989
1989 switch (cmd) { 1990 switch (cmd) {
@@ -2008,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2008{ 2009{
2009 int ret; 2010 int ret;
2010 2011
2011 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2012 if (!capable(CAP_NET_ADMIN))
2012 return -EPERM; 2013 return -EPERM;
2013 2014
2014 switch (cmd) { 2015 switch (cmd) {
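The logging change in this hunk (and the matching one in arp_tables.c above) swaps net_info_ratelimited() back to an open-coded net_ratelimit() check. A hedged kernel-style fragment showing that the two forms are equivalent; it is a sketch for comparison, not a standalone program, and the function name is illustrative:

#include <linux/net.h>          /* net_ratelimit(), net_info_ratelimited() */
#include <linux/printk.h>

static void report_bad_target(const char *name)
{
        /* newer helper: rate limiting and printk in one call */
        net_info_ratelimited("error: `%s'\n", name);

        /* older open-coded equivalent, as used after this revert */
        if (net_ratelimit())
                pr_info("error: `%s'\n", name);
}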
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 75e33a7048f..db8d22db425 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -246,7 +246,8 @@ clusterip_hashfn(const struct sk_buff *skb,
246 dport = ports[1]; 246 dport = ports[1];
247 } 247 }
248 } else { 248 } else {
249 net_info_ratelimited("unknown protocol %u\n", iph->protocol); 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol);
250 } 251 }
251 252
252 switch (config->hash_mode) { 253 switch (config->hash_mode) {
@@ -394,6 +395,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
394 config = clusterip_config_init(cipinfo, 395 config = clusterip_config_init(cipinfo,
395 e->ip.dst.s_addr, dev); 396 e->ip.dst.s_addr, dev);
396 if (!config) { 397 if (!config) {
398 pr_info("cannot allocate config\n");
397 dev_put(dev); 399 dev_put(dev);
398 return -ENOMEM; 400 return -ENOMEM;
399 } 401 }
@@ -661,7 +663,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
661#define PROC_WRITELEN 10 663#define PROC_WRITELEN 10
662 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
663 unsigned long nodenum; 665 unsigned long nodenum;
664 int rc;
665 666
666 if (size > PROC_WRITELEN) 667 if (size > PROC_WRITELEN)
667 return -EIO; 668 return -EIO;
@@ -670,15 +671,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
670 buffer[size] = 0; 671 buffer[size] = 0;
671 672
672 if (*buffer == '+') { 673 if (*buffer == '+') {
673 rc = kstrtoul(buffer+1, 10, &nodenum); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
674 if (rc)
675 return rc;
676 if (clusterip_add_node(c, nodenum)) 675 if (clusterip_add_node(c, nodenum))
677 return -ENOMEM; 676 return -ENOMEM;
678 } else if (*buffer == '-') { 677 } else if (*buffer == '-') {
679 rc = kstrtoul(buffer+1, 10, &nodenum); 678 nodenum = simple_strtoul(buffer+1, NULL,10);
680 if (rc)
681 return rc;
682 if (clusterip_del_node(c, nodenum)) 679 if (clusterip_del_node(c, nodenum))
683 return -ENOENT; 680 return -ENOENT;
684 } else 681 } else
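The proc_write change above drops kstrtoul() in favour of the older simple_strtoul(). For reference, a minimal kernel-style sketch of the checked-parse pattern the upstream side uses (the helper name is illustrative):

#include <linux/kernel.h>       /* kstrtoul() */

static int parse_node_number(const char *buf, unsigned long *nodenum)
{
        int rc;

        /* kstrtoul() rejects overflow and trailing garbage, which
         * simple_strtoul() silently ignores, and returns an errno the
         * caller can propagate back to userspace. */
        rc = kstrtoul(buf, 10, nodenum);
        if (rc)
                return rc;      /* -EINVAL or -ERANGE */

        return 0;
}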
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 5d5d4d1be9c..9931152a78b 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/netfilter/nf_nat_rule.h>
22#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
23#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h>
25 25
26MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
30/* FIXME: Multiple targets. --RR */ 30/* FIXME: Multiple targets. --RR */
31static int masquerade_tg_check(const struct xt_tgchk_param *par) 31static int masquerade_tg_check(const struct xt_tgchk_param *par)
32{ 32{
33 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 33 const struct nf_nat_multi_range_compat *mr = par->targinfo;
34 34
35 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { 35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
36 pr_debug("bad MAP_IPS.\n"); 36 pr_debug("bad MAP_IPS.\n");
37 return -EINVAL; 37 return -EINVAL;
38 } 38 }
@@ -50,9 +50,9 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_range newrange; 52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc, nh; 55 __be32 newsrc;
56 56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58 58
@@ -70,8 +70,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
70 70
71 mr = par->targinfo; 71 mr = par->targinfo;
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr); 73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
75 if (!newsrc) { 74 if (!newsrc) {
76 pr_info("%s ate my IP address\n", par->out->name); 75 pr_info("%s ate my IP address\n", par->out->name);
77 return NF_DROP; 76 return NF_DROP;
@@ -80,16 +79,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
80 nat->masq_index = par->out->ifindex; 79 nat->masq_index = par->out->ifindex;
81 80
82 /* Transfer from original range. */ 81 /* Transfer from original range. */
83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); 82 newrange = ((struct nf_nat_range)
84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); 83 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; 84 newsrc, newsrc,
86 newrange.min_addr.ip = newsrc; 85 mr->range[0].min, mr->range[0].max });
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
90 86
91 /* Hand modified range to generic setup. */ 87 /* Hand modified range to generic setup. */
92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); 88 return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
93} 89}
94 90
95static int 91static int
@@ -99,8 +95,7 @@ device_cmp(struct nf_conn *i, void *ifindex)
99 95
100 if (!nat) 96 if (!nat)
101 return 0; 97 return 0;
102 if (nf_ct_l3num(i) != NFPROTO_IPV4) 98
103 return 0;
104 return nat->masq_index == (int)(long)ifindex; 99 return nat->masq_index == (int)(long)ifindex;
105} 100}
106 101
@@ -144,7 +139,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
144 .name = "MASQUERADE", 139 .name = "MASQUERADE",
145 .family = NFPROTO_IPV4, 140 .family = NFPROTO_IPV4,
146 .target = masquerade_tg, 141 .target = masquerade_tg,
147 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), 142 .targetsize = sizeof(struct nf_nat_multi_range_compat),
148 .table = "nat", 143 .table = "nat",
149 .hooks = 1 << NF_INET_POST_ROUTING, 144 .hooks = 1 << NF_INET_POST_ROUTING,
150 .checkentry = masquerade_tg_check, 145 .checkentry = masquerade_tg_check,
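For reference, the upstream (left-hand) side of this hunk resolves the route's next hop with rt_nexthop() instead of reading rt->rt_gateway, and fills the newer nf_nat_range structure field by field. A condensed, hedged sketch of that flow; the function name is illustrative and error handling is trimmed to the essentials:

#include <net/ip.h>
#include <net/route.h>
#include <net/netfilter/nf_nat.h>

static unsigned int masquerade_pick_source(struct sk_buff *skb,
                                           const struct net_device *out,
                                           struct nf_conn *ct,
                                           const struct nf_nat_ipv4_multi_range_compat *mr)
{
        const struct rtable *rt = skb_rtable(skb);
        __be32 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
        __be32 newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
        struct nf_nat_range newrange;

        if (!newsrc)
                return NF_DROP;         /* no usable address on this device */

        memset(&newrange, 0, sizeof(newrange));
        newrange.flags       = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
        newrange.min_addr.ip = newsrc;
        newrange.max_addr.ip = newsrc;
        newrange.min_proto   = mr->range[0].min;
        newrange.max_proto   = mr->range[0].max;

        return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}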
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 04b18c1ac34..9dd754c7f2b 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -81,7 +81,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
81 niph->saddr = oiph->daddr; 81 niph->saddr = oiph->daddr;
82 niph->daddr = oiph->saddr; 82 niph->daddr = oiph->saddr;
83 83
84 skb_reset_transport_header(nskb);
85 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); 84 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
86 memset(tcph, 0, sizeof(*tcph)); 85 memset(tcph, 0, sizeof(*tcph));
87 tcph->source = oth->dest; 86 tcph->source = oth->dest;
@@ -129,6 +128,14 @@ static void send_reset(struct sk_buff *oldskb, int hook)
129static inline void send_unreach(struct sk_buff *skb_in, int code) 128static inline void send_unreach(struct sk_buff *skb_in, int code)
130{ 129{
131 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); 130 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
131#ifdef CONFIG_IP_NF_TARGET_REJECT_SKERR
132 if (skb_in->sk) {
133 skb_in->sk->sk_err = icmp_err_convert[code].errno;
134 skb_in->sk->sk_error_report(skb_in->sk);
135 pr_debug("ipt_REJECT: sk_err=%d for skb=%p sk=%p\n",
136 skb_in->sk->sk_err, skb_in, skb_in->sk);
137 }
138#endif
132} 139}
133 140
134static unsigned int 141static unsigned int
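The SKERR branch added above converts the ICMP code to an errno with icmp_err_convert[] and wakes the local sender via sk_error_report(). A hedged sketch of that conversion; with it, a locally generated connection rejected with icmp-port-unreachable surfaces as ECONNREFUSED rather than hanging until a timeout:

#include <linux/icmp.h>
#include <net/icmp.h>           /* icmp_send(), icmp_err_convert[] */

static void send_unreach_with_skerr(struct sk_buff *skb_in, int code)
{
        icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);

        /* Propagate the reject to the local socket, if there is one. */
        if (skb_in->sk) {
                skb_in->sk->sk_err = icmp_err_convert[code].errno;
                skb_in->sk->sk_error_report(skb_in->sk);
        }
}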
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b5ef3cba225..446e0f467a1 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -65,7 +65,7 @@ static unsigned int flushtimeout = 10;
65module_param(flushtimeout, uint, 0600); 65module_param(flushtimeout, uint, 0600);
66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); 66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
67 67
68static bool nflog = true; 68static int nflog = 1;
69module_param(nflog, bool, 0400); 69module_param(nflog, bool, 0400);
70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); 70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
71 71
@@ -135,8 +135,10 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
135 * due to slab allocator restrictions */ 135 * due to slab allocator restrictions */
136 136
137 n = max(size, nlbufsiz); 137 n = max(size, nlbufsiz);
138 skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); 138 skb = alloc_skb(n, GFP_ATOMIC);
139 if (!skb) { 139 if (!skb) {
140 pr_debug("cannot alloc whole buffer %ub!\n", n);
141
140 if (n > size) { 142 if (n > size) {
141 /* try to allocate only as much as we need for 143 /* try to allocate only as much as we need for
142 * current packet */ 144 * current packet */
@@ -196,15 +198,12 @@ static void ipt_ulog_packet(unsigned int hooknum,
196 198
197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); 199 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
198 200
199 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 201 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
200 sizeof(*pm)+copy_len, 0); 202 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
201 if (!nlh) { 203 sizeof(*pm)+copy_len);
202 pr_debug("error during nlmsg_put\n");
203 goto out_unlock;
204 }
205 ub->qlen++; 204 ub->qlen++;
206 205
207 pm = nlmsg_data(nlh); 206 pm = NLMSG_DATA(nlh);
208 207
209 /* We might not have a timestamp, get one */ 208 /* We might not have a timestamp, get one */
210 if (skb->tstamp.tv64 == 0) 209 if (skb->tstamp.tv64 == 0)
@@ -264,11 +263,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
264 nlh->nlmsg_type = NLMSG_DONE; 263 nlh->nlmsg_type = NLMSG_DONE;
265 ulog_send(groupnum); 264 ulog_send(groupnum);
266 } 265 }
267out_unlock: 266
268 spin_unlock_bh(&ulog_lock); 267 spin_unlock_bh(&ulog_lock);
269 268
270 return; 269 return;
271 270
271nlmsg_failure:
272 pr_debug("error during NLMSG_PUT\n");
272alloc_failure: 273alloc_failure:
273 pr_debug("Error building netlink message\n"); 274 pr_debug("Error building netlink message\n");
274 spin_unlock_bh(&ulog_lock); 275 spin_unlock_bh(&ulog_lock);
@@ -381,9 +382,6 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
381static int __init ulog_tg_init(void) 382static int __init ulog_tg_init(void)
382{ 383{
383 int ret, i; 384 int ret, i;
384 struct netlink_kernel_cfg cfg = {
385 .groups = ULOG_MAXNLGROUPS,
386 };
387 385
388 pr_debug("init module\n"); 386 pr_debug("init module\n");
389 387
@@ -396,7 +394,9 @@ static int __init ulog_tg_init(void)
396 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 394 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 395 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
398 396
399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); 397 nflognl = netlink_kernel_create(&init_net,
398 NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
399 NULL, THIS_MODULE);
400 if (!nflognl) 400 if (!nflognl)
401 return -ENOMEM; 401 return -ENOMEM;
402 402
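The netlink change in this hunk trades NLMSG_PUT(), whose failure path is the hidden goto nlmsg_failure called out in the comment, for nlmsg_put(), which simply returns NULL. A minimal hedged sketch of the explicit style (the helper name and error code are illustrative):

#include <linux/errno.h>
#include <net/netlink.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>      /* ULOG_NL_EVENT */

static int append_ulog_message(struct sk_buff *nlskb, unsigned int seq,
                               size_t payload_len)
{
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(nlskb, 0, seq, ULOG_NL_EVENT, payload_len, 0);
        if (!nlh)
                return -ENOBUFS;        /* no hidden goto: handled in line */

        /* fill the payload via nlmsg_data(nlh) ... */
        return 0;
}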
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
deleted file mode 100644
index c30130062cd..00000000000
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ /dev/null
@@ -1,141 +0,0 @@
1/*
2 * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netdevice.h>
14#include <linux/ip.h>
15#include <net/ip.h>
16#include <net/ip_fib.h>
17#include <net/route.h>
18
19#include <linux/netfilter/xt_rpfilter.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
24MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
25
26/* don't try to find route from mcast/bcast/zeronet */
27static __be32 rpfilter_get_saddr(__be32 addr)
28{
29 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
30 ipv4_is_zeronet(addr))
31 return 0;
32 return addr;
33}
34
35static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
36 const struct net_device *dev, u8 flags)
37{
38 struct fib_result res;
39 bool dev_match;
40 struct net *net = dev_net(dev);
41 int ret __maybe_unused;
42
43 if (fib_lookup(net, fl4, &res))
44 return false;
45
46 if (res.type != RTN_UNICAST) {
47 if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
48 return false;
49 }
50 dev_match = false;
51#ifdef CONFIG_IP_ROUTE_MULTIPATH
52 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
53 struct fib_nh *nh = &res.fi->fib_nh[ret];
54
55 if (nh->nh_dev == dev) {
56 dev_match = true;
57 break;
58 }
59 }
60#else
61 if (FIB_RES_DEV(res) == dev)
62 dev_match = true;
63#endif
64 if (dev_match || flags & XT_RPFILTER_LOOSE)
65 return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
66 return dev_match;
67}
68
69static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
70{
71 const struct xt_rpfilter_info *info;
72 const struct iphdr *iph;
73 struct flowi4 flow;
74 bool invert;
75
76 info = par->matchinfo;
77 invert = info->flags & XT_RPFILTER_INVERT;
78
79 if (par->in->flags & IFF_LOOPBACK)
80 return true ^ invert;
81
82 iph = ip_hdr(skb);
83 if (ipv4_is_multicast(iph->daddr)) {
84 if (ipv4_is_zeronet(iph->saddr))
85 return ipv4_is_local_multicast(iph->daddr) ^ invert;
86 flow.flowi4_iif = 0;
87 } else {
88 flow.flowi4_iif = LOOPBACK_IFINDEX;
89 }
90
91 flow.daddr = iph->saddr;
92 flow.saddr = rpfilter_get_saddr(iph->daddr);
93 flow.flowi4_oif = 0;
94 flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
95 flow.flowi4_tos = RT_TOS(iph->tos);
96 flow.flowi4_scope = RT_SCOPE_UNIVERSE;
97
98 return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
99}
100
101static int rpfilter_check(const struct xt_mtchk_param *par)
102{
103 const struct xt_rpfilter_info *info = par->matchinfo;
104 unsigned int options = ~XT_RPFILTER_OPTION_MASK;
105 if (info->flags & options) {
106 pr_info("unknown options encountered");
107 return -EINVAL;
108 }
109
110 if (strcmp(par->table, "mangle") != 0 &&
111 strcmp(par->table, "raw") != 0) {
112 pr_info("match only valid in the \'raw\' "
113 "or \'mangle\' tables, not \'%s\'.\n", par->table);
114 return -EINVAL;
115 }
116
117 return 0;
118}
119
120static struct xt_match rpfilter_mt_reg __read_mostly = {
121 .name = "rpfilter",
122 .family = NFPROTO_IPV4,
123 .checkentry = rpfilter_check,
124 .match = rpfilter_mt,
125 .matchsize = sizeof(struct xt_rpfilter_info),
126 .hooks = (1 << NF_INET_PRE_ROUTING),
127 .me = THIS_MODULE
128};
129
130static int __init rpfilter_mt_init(void)
131{
132 return xt_register_match(&rpfilter_mt_reg);
133}
134
135static void __exit rpfilter_mt_exit(void)
136{
137 xt_unregister_match(&rpfilter_mt_reg);
138}
139
140module_init(rpfilter_mt_init);
141module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 6b3da5cf54e..c37641e819f 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
52static struct nf_hook_ops *filter_ops __read_mostly; 52static struct nf_hook_ops *filter_ops __read_mostly;
53 53
54/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
55static bool forward = true; 55static int forward = NF_ACCEPT;
56module_param(forward, bool, 0000); 56module_param(forward, bool, 0000);
57 57
58static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
@@ -64,12 +64,14 @@ static int __net_init iptable_filter_net_init(struct net *net)
64 return -ENOMEM; 64 return -ENOMEM;
65 /* Entry 1 is the FORWARD hook */ 65 /* Entry 1 is the FORWARD hook */
66 ((struct ipt_standard *)repl->entries)[1].target.verdict = 66 ((struct ipt_standard *)repl->entries)[1].target.verdict =
67 forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; 67 -forward - 1;
68 68
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 return PTR_RET(net->ipv4.iptable_filter); 72 if (IS_ERR(net->ipv4.iptable_filter))
73 return PTR_ERR(net->ipv4.iptable_filter);
74 return 0;
73} 75}
74 76
75static void __net_exit iptable_filter_net_exit(struct net *net) 77static void __net_exit iptable_filter_net_exit(struct net *net)
@@ -86,6 +88,11 @@ static int __init iptable_filter_init(void)
86{ 88{
87 int ret; 89 int ret;
88 90
91 if (forward < 0 || forward > NF_MAX_VERDICT) {
92 pr_err("iptables forward must be 0 or 1\n");
93 return -EINVAL;
94 }
95
89 ret = register_pernet_subsys(&iptable_filter_net_ops); 96 ret = register_pernet_subsys(&iptable_filter_net_ops);
90 if (ret < 0) 97 if (ret < 0)
91 return ret; 98 return ret;
@@ -94,10 +101,14 @@ static int __init iptable_filter_init(void)
94 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); 101 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
95 if (IS_ERR(filter_ops)) { 102 if (IS_ERR(filter_ops)) {
96 ret = PTR_ERR(filter_ops); 103 ret = PTR_ERR(filter_ops);
97 unregister_pernet_subsys(&iptable_filter_net_ops); 104 goto cleanup_table;
98 } 105 }
99 106
100 return ret; 107 return ret;
108
109 cleanup_table:
110 unregister_pernet_subsys(&iptable_filter_net_ops);
111 return ret;
101} 112}
102 113
103static void __exit iptable_filter_fini(void) 114static void __exit iptable_filter_fini(void)
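The PTR_RET() dropped in this hunk is shorthand for exactly the IS_ERR()/PTR_ERR() sequence that replaces it (upstream later renamed the macro PTR_ERR_OR_ZERO()). A small sketch of the equivalence, using a generic pointer so it stays self-contained:

#include <linux/err.h>

static int register_status_open_coded(void *table)
{
        if (IS_ERR(table))
                return PTR_ERR(table);
        return 0;
}

static int register_status_macro(void *table)
{
        return PTR_RET(table);  /* same result as the open-coded form */
}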
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 85d88f20644..aef5d1fbe77 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,7 +104,9 @@ static int __net_init iptable_mangle_net_init(struct net *net)
104 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
105 ipt_register_table(net, &packet_mangler, repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl); 106 kfree(repl);
107 return PTR_RET(net->ipv4.iptable_mangle); 107 if (IS_ERR(net->ipv4.iptable_mangle))
108 return PTR_ERR(net->ipv4.iptable_mangle);
109 return 0;
108} 110}
109 111
110static void __net_exit iptable_mangle_net_exit(struct net *net) 112static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -129,10 +131,14 @@ static int __init iptable_mangle_init(void)
129 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); 131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
130 if (IS_ERR(mangle_ops)) { 132 if (IS_ERR(mangle_ops)) {
131 ret = PTR_ERR(mangle_ops); 133 ret = PTR_ERR(mangle_ops);
132 unregister_pernet_subsys(&iptable_mangle_net_ops); 134 goto cleanup_table;
133 } 135 }
134 136
135 return ret; 137 return ret;
138
139 cleanup_table:
140 unregister_pernet_subsys(&iptable_mangle_net_ops);
141 return ret;
136} 142}
137 143
138static void __exit iptable_mangle_fini(void) 144static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
deleted file mode 100644
index eeaff7e4acb..00000000000
--- a/net/ipv4/netfilter/iptable_nat.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2011 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter_ipv4.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/ip.h>
15#include <net/ip.h>
16
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_core.h>
19#include <net/netfilter/nf_nat_l3proto.h>
20
21static const struct xt_table nf_nat_ipv4_table = {
22 .name = "nat",
23 .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
24 (1 << NF_INET_POST_ROUTING) |
25 (1 << NF_INET_LOCAL_OUT) |
26 (1 << NF_INET_LOCAL_IN),
27 .me = THIS_MODULE,
28 .af = NFPROTO_IPV4,
29};
30
31static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
32{
33 /* Force range to this IP; let proto decide mapping for
34 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
35 */
36 struct nf_nat_range range;
37
38 range.flags = 0;
39 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
40 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
41 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
42 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
43
44 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
45}
46
47static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
48 const struct net_device *in,
49 const struct net_device *out,
50 struct nf_conn *ct)
51{
52 struct net *net = nf_ct_net(ct);
53 unsigned int ret;
54
55 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
56 if (ret == NF_ACCEPT) {
57 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
58 ret = alloc_null_binding(ct, hooknum);
59 }
60 return ret;
61}
62
63static unsigned int
64nf_nat_ipv4_fn(unsigned int hooknum,
65 struct sk_buff *skb,
66 const struct net_device *in,
67 const struct net_device *out,
68 int (*okfn)(struct sk_buff *))
69{
70 struct nf_conn *ct;
71 enum ip_conntrack_info ctinfo;
72 struct nf_conn_nat *nat;
73 /* maniptype == SRC for postrouting. */
74 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
75
76 /* We never see fragments: conntrack defrags on pre-routing
77 * and local-out, and nf_nat_out protects post-routing.
78 */
79 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
80
81 ct = nf_ct_get(skb, &ctinfo);
82 /* Can't track? It's not due to stress, or conntrack would
 83 * have dropped it. Hence it's the user's responsibility to
84 * packet filter it out, or implement conntrack/NAT for that
85 * protocol. 8) --RR
86 */
87 if (!ct)
88 return NF_ACCEPT;
89
90 /* Don't try to NAT if this packet is not conntracked */
91 if (nf_ct_is_untracked(ct))
92 return NF_ACCEPT;
93
94 nat = nfct_nat(ct);
95 if (!nat) {
96 /* NAT module was loaded late. */
97 if (nf_ct_is_confirmed(ct))
98 return NF_ACCEPT;
99 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
100 if (nat == NULL) {
101 pr_debug("failed to add NAT extension\n");
102 return NF_ACCEPT;
103 }
104 }
105
106 switch (ctinfo) {
107 case IP_CT_RELATED:
108 case IP_CT_RELATED_REPLY:
109 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
110 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
111 hooknum))
112 return NF_DROP;
113 else
114 return NF_ACCEPT;
115 }
116 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
117 case IP_CT_NEW:
118 /* Seen it before? This can happen for loopback, retrans,
119 * or local packets.
120 */
121 if (!nf_nat_initialized(ct, maniptype)) {
122 unsigned int ret;
123
124 ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
125 if (ret != NF_ACCEPT)
126 return ret;
127 } else {
128 pr_debug("Already setup manip %s for ct %p\n",
129 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
130 ct);
131 if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
132 goto oif_changed;
133 }
134 break;
135
136 default:
137 /* ESTABLISHED */
138 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
139 ctinfo == IP_CT_ESTABLISHED_REPLY);
140 if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
141 goto oif_changed;
142 }
143
144 return nf_nat_packet(ct, ctinfo, hooknum, skb);
145
146oif_changed:
147 nf_ct_kill_acct(ct, ctinfo, skb);
148 return NF_DROP;
149}
150
151static unsigned int
152nf_nat_ipv4_in(unsigned int hooknum,
153 struct sk_buff *skb,
154 const struct net_device *in,
155 const struct net_device *out,
156 int (*okfn)(struct sk_buff *))
157{
158 unsigned int ret;
159 __be32 daddr = ip_hdr(skb)->daddr;
160
161 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
162 if (ret != NF_DROP && ret != NF_STOLEN &&
163 daddr != ip_hdr(skb)->daddr)
164 skb_dst_drop(skb);
165
166 return ret;
167}
168
169static unsigned int
170nf_nat_ipv4_out(unsigned int hooknum,
171 struct sk_buff *skb,
172 const struct net_device *in,
173 const struct net_device *out,
174 int (*okfn)(struct sk_buff *))
175{
176#ifdef CONFIG_XFRM
177 const struct nf_conn *ct;
178 enum ip_conntrack_info ctinfo;
179#endif
180 unsigned int ret;
181
182 /* root is playing with raw sockets. */
183 if (skb->len < sizeof(struct iphdr) ||
184 ip_hdrlen(skb) < sizeof(struct iphdr))
185 return NF_ACCEPT;
186
187 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
188#ifdef CONFIG_XFRM
189 if (ret != NF_DROP && ret != NF_STOLEN &&
190 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
191 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
192 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
193
194 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
195 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
196 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
197 ct->tuplehash[dir].tuple.src.u.all !=
198 ct->tuplehash[!dir].tuple.dst.u.all))
199 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
200 ret = NF_DROP;
201 }
202#endif
203 return ret;
204}
205
206static unsigned int
207nf_nat_ipv4_local_fn(unsigned int hooknum,
208 struct sk_buff *skb,
209 const struct net_device *in,
210 const struct net_device *out,
211 int (*okfn)(struct sk_buff *))
212{
213 const struct nf_conn *ct;
214 enum ip_conntrack_info ctinfo;
215 unsigned int ret;
216
217 /* root is playing with raw sockets. */
218 if (skb->len < sizeof(struct iphdr) ||
219 ip_hdrlen(skb) < sizeof(struct iphdr))
220 return NF_ACCEPT;
221
222 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
223 if (ret != NF_DROP && ret != NF_STOLEN &&
224 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
225 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
226
227 if (ct->tuplehash[dir].tuple.dst.u3.ip !=
228 ct->tuplehash[!dir].tuple.src.u3.ip) {
229 if (ip_route_me_harder(skb, RTN_UNSPEC))
230 ret = NF_DROP;
231 }
232#ifdef CONFIG_XFRM
233 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
234 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
235 ct->tuplehash[dir].tuple.dst.u.all !=
236 ct->tuplehash[!dir].tuple.src.u.all)
237 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
238 ret = NF_DROP;
239#endif
240 }
241 return ret;
242}
243
244static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
245 /* Before packet filtering, change destination */
246 {
247 .hook = nf_nat_ipv4_in,
248 .owner = THIS_MODULE,
249 .pf = NFPROTO_IPV4,
250 .hooknum = NF_INET_PRE_ROUTING,
251 .priority = NF_IP_PRI_NAT_DST,
252 },
253 /* After packet filtering, change source */
254 {
255 .hook = nf_nat_ipv4_out,
256 .owner = THIS_MODULE,
257 .pf = NFPROTO_IPV4,
258 .hooknum = NF_INET_POST_ROUTING,
259 .priority = NF_IP_PRI_NAT_SRC,
260 },
261 /* Before packet filtering, change destination */
262 {
263 .hook = nf_nat_ipv4_local_fn,
264 .owner = THIS_MODULE,
265 .pf = NFPROTO_IPV4,
266 .hooknum = NF_INET_LOCAL_OUT,
267 .priority = NF_IP_PRI_NAT_DST,
268 },
269 /* After packet filtering, change source */
270 {
271 .hook = nf_nat_ipv4_fn,
272 .owner = THIS_MODULE,
273 .pf = NFPROTO_IPV4,
274 .hooknum = NF_INET_LOCAL_IN,
275 .priority = NF_IP_PRI_NAT_SRC,
276 },
277};
278
279static int __net_init iptable_nat_net_init(struct net *net)
280{
281 struct ipt_replace *repl;
282
283 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
284 if (repl == NULL)
285 return -ENOMEM;
286 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
287 kfree(repl);
288 return PTR_RET(net->ipv4.nat_table);
289}
290
291static void __net_exit iptable_nat_net_exit(struct net *net)
292{
293 ipt_unregister_table(net, net->ipv4.nat_table);
294}
295
296static struct pernet_operations iptable_nat_net_ops = {
297 .init = iptable_nat_net_init,
298 .exit = iptable_nat_net_exit,
299};
300
301static int __init iptable_nat_init(void)
302{
303 int err;
304
305 err = register_pernet_subsys(&iptable_nat_net_ops);
306 if (err < 0)
307 goto err1;
308
309 err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
310 if (err < 0)
311 goto err2;
312 return 0;
313
314err2:
315 unregister_pernet_subsys(&iptable_nat_net_ops);
316err1:
317 return err;
318}
319
320static void __exit iptable_nat_exit(void)
321{
322 nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
323 unregister_pernet_subsys(&iptable_nat_net_ops);
324}
325
326module_init(iptable_nat_init);
327module_exit(iptable_nat_exit);
328
329MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 03d9696d3c6..07fb710cd72 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,7 +48,9 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.iptable_raw); 51 if (IS_ERR(net->ipv4.iptable_raw))
52 return PTR_ERR(net->ipv4.iptable_raw);
53 return 0;
52} 54}
53 55
54static void __net_exit iptable_raw_net_exit(struct net *net) 56static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -73,10 +75,14 @@ static int __init iptable_raw_init(void)
73 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); 75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
74 if (IS_ERR(rawtable_ops)) { 76 if (IS_ERR(rawtable_ops)) {
75 ret = PTR_ERR(rawtable_ops); 77 ret = PTR_ERR(rawtable_ops);
76 unregister_pernet_subsys(&iptable_raw_net_ops); 78 goto cleanup_table;
77 } 79 }
78 80
79 return ret; 81 return ret;
82
83 cleanup_table:
84 unregister_pernet_subsys(&iptable_raw_net_ops);
85 return ret;
80} 86}
81 87
82static void __exit iptable_raw_fini(void) 88static void __exit iptable_raw_fini(void)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index b283d8e2601..be45bdc4c60 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,7 +66,10 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 return PTR_RET(net->ipv4.iptable_security); 69 if (IS_ERR(net->ipv4.iptable_security))
70 return PTR_ERR(net->ipv4.iptable_security);
71
72 return 0;
70} 73}
71 74
72static void __net_exit iptable_security_net_exit(struct net *net) 75static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcdd0c2406e..de9da21113a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,6 +29,11 @@
29#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 29#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
30#include <net/netfilter/nf_log.h> 30#include <net/netfilter/nf_log.h>
31 31
32int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
33 struct nf_conn *ct,
34 enum ip_conntrack_info ctinfo);
35EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
36
32static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 37static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
33 struct nf_conntrack_tuple *tuple) 38 struct nf_conntrack_tuple *tuple)
34{ 39{
@@ -69,32 +74,24 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
69 74
70 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 75 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
71 if (iph == NULL) 76 if (iph == NULL)
72 return -NF_ACCEPT; 77 return -NF_DROP;
73 78
74 /* Conntrack defragments packets, we might still see fragments 79 /* Conntrack defragments packets, we might still see fragments
75 * inside ICMP packets though. */ 80 * inside ICMP packets though. */
76 if (iph->frag_off & htons(IP_OFFSET)) 81 if (iph->frag_off & htons(IP_OFFSET))
77 return -NF_ACCEPT; 82 return -NF_DROP;
78 83
79 *dataoff = nhoff + (iph->ihl << 2); 84 *dataoff = nhoff + (iph->ihl << 2);
80 *protonum = iph->protocol; 85 *protonum = iph->protocol;
81 86
82 /* Check bogus IP headers */
83 if (*dataoff > skb->len) {
84 pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
85 "nhoff %u, ihl %u, skblen %u\n",
86 nhoff, iph->ihl << 2, skb->len);
87 return -NF_ACCEPT;
88 }
89
90 return NF_ACCEPT; 87 return NF_ACCEPT;
91} 88}
92 89
93static unsigned int ipv4_helper(unsigned int hooknum, 90static unsigned int ipv4_confirm(unsigned int hooknum,
94 struct sk_buff *skb, 91 struct sk_buff *skb,
95 const struct net_device *in, 92 const struct net_device *in,
96 const struct net_device *out, 93 const struct net_device *out,
97 int (*okfn)(struct sk_buff *)) 94 int (*okfn)(struct sk_buff *))
98{ 95{
99 struct nf_conn *ct; 96 struct nf_conn *ct;
100 enum ip_conntrack_info ctinfo; 97 enum ip_conntrack_info ctinfo;
@@ -105,38 +102,24 @@ static unsigned int ipv4_helper(unsigned int hooknum,
105 /* This is where we call the helper: as the packet goes out. */ 102 /* This is where we call the helper: as the packet goes out. */
106 ct = nf_ct_get(skb, &ctinfo); 103 ct = nf_ct_get(skb, &ctinfo);
107 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 104 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
108 return NF_ACCEPT; 105 goto out;
109 106
110 help = nfct_help(ct); 107 help = nfct_help(ct);
111 if (!help) 108 if (!help)
112 return NF_ACCEPT; 109 goto out;
113 110
114 /* rcu_read_lock()ed by nf_hook_slow */ 111 /* rcu_read_lock()ed by nf_hook_slow */
115 helper = rcu_dereference(help->helper); 112 helper = rcu_dereference(help->helper);
116 if (!helper) 113 if (!helper)
117 return NF_ACCEPT; 114 goto out;
118 115
119 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 116 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
120 ct, ctinfo); 117 ct, ctinfo);
121 if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { 118 if (ret != NF_ACCEPT) {
122 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, 119 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
123 "nf_ct_%s: dropping packet", helper->name); 120 "nf_ct_%s: dropping packet", helper->name);
121 return ret;
124 } 122 }
125 return ret;
126}
127
128static unsigned int ipv4_confirm(unsigned int hooknum,
129 struct sk_buff *skb,
130 const struct net_device *in,
131 const struct net_device *out,
132 int (*okfn)(struct sk_buff *))
133{
134 struct nf_conn *ct;
135 enum ip_conntrack_info ctinfo;
136
137 ct = nf_ct_get(skb, &ctinfo);
138 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
139 goto out;
140 123
141 /* adjust seqs for loopback traffic only in outgoing direction */ 124 /* adjust seqs for loopback traffic only in outgoing direction */
142 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 125 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -144,8 +127,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
144 typeof(nf_nat_seq_adjust_hook) seq_adjust; 127 typeof(nf_nat_seq_adjust_hook) seq_adjust;
145 128
146 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 129 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
147 if (!seq_adjust || 130 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
148 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
149 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 131 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
150 return NF_DROP; 132 return NF_DROP;
151 } 133 }
@@ -195,13 +177,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
195 .priority = NF_IP_PRI_CONNTRACK, 177 .priority = NF_IP_PRI_CONNTRACK,
196 }, 178 },
197 { 179 {
198 .hook = ipv4_helper,
199 .owner = THIS_MODULE,
200 .pf = NFPROTO_IPV4,
201 .hooknum = NF_INET_POST_ROUTING,
202 .priority = NF_IP_PRI_CONNTRACK_HELPER,
203 },
204 {
205 .hook = ipv4_confirm, 180 .hook = ipv4_confirm,
206 .owner = THIS_MODULE, 181 .owner = THIS_MODULE,
207 .pf = NFPROTO_IPV4, 182 .pf = NFPROTO_IPV4,
@@ -209,13 +184,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
209 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 184 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
210 }, 185 },
211 { 186 {
212 .hook = ipv4_helper,
213 .owner = THIS_MODULE,
214 .pf = NFPROTO_IPV4,
215 .hooknum = NF_INET_LOCAL_IN,
216 .priority = NF_IP_PRI_CONNTRACK_HELPER,
217 },
218 {
219 .hook = ipv4_confirm, 187 .hook = ipv4_confirm,
220 .owner = THIS_MODULE, 188 .owner = THIS_MODULE,
221 .pf = NFPROTO_IPV4, 189 .pf = NFPROTO_IPV4,
@@ -231,30 +199,35 @@ static int log_invalid_proto_max = 255;
231static ctl_table ip_ct_sysctl_table[] = { 199static ctl_table ip_ct_sysctl_table[] = {
232 { 200 {
233 .procname = "ip_conntrack_max", 201 .procname = "ip_conntrack_max",
202 .data = &nf_conntrack_max,
234 .maxlen = sizeof(int), 203 .maxlen = sizeof(int),
235 .mode = 0644, 204 .mode = 0644,
236 .proc_handler = proc_dointvec, 205 .proc_handler = proc_dointvec,
237 }, 206 },
238 { 207 {
239 .procname = "ip_conntrack_count", 208 .procname = "ip_conntrack_count",
209 .data = &init_net.ct.count,
240 .maxlen = sizeof(int), 210 .maxlen = sizeof(int),
241 .mode = 0444, 211 .mode = 0444,
242 .proc_handler = proc_dointvec, 212 .proc_handler = proc_dointvec,
243 }, 213 },
244 { 214 {
245 .procname = "ip_conntrack_buckets", 215 .procname = "ip_conntrack_buckets",
216 .data = &init_net.ct.htable_size,
246 .maxlen = sizeof(unsigned int), 217 .maxlen = sizeof(unsigned int),
247 .mode = 0444, 218 .mode = 0444,
248 .proc_handler = proc_dointvec, 219 .proc_handler = proc_dointvec,
249 }, 220 },
250 { 221 {
251 .procname = "ip_conntrack_checksum", 222 .procname = "ip_conntrack_checksum",
223 .data = &init_net.ct.sysctl_checksum,
252 .maxlen = sizeof(int), 224 .maxlen = sizeof(int),
253 .mode = 0644, 225 .mode = 0644,
254 .proc_handler = proc_dointvec, 226 .proc_handler = proc_dointvec,
255 }, 227 },
256 { 228 {
257 .procname = "ip_conntrack_log_invalid", 229 .procname = "ip_conntrack_log_invalid",
230 .data = &init_net.ct.sysctl_log_invalid,
258 .maxlen = sizeof(unsigned int), 231 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 232 .mode = 0644,
260 .proc_handler = proc_dointvec_minmax, 233 .proc_handler = proc_dointvec_minmax,
@@ -330,9 +303,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
330static int ipv4_tuple_to_nlattr(struct sk_buff *skb, 303static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
331 const struct nf_conntrack_tuple *tuple) 304 const struct nf_conntrack_tuple *tuple)
332{ 305{
333 if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || 306 NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
334 nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) 307 NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
335 goto nla_put_failure;
336 return 0; 308 return 0;
337 309
338nla_put_failure: 310nla_put_failure:
@@ -370,25 +342,6 @@ static struct nf_sockopt_ops so_getorigdst = {
370 .owner = THIS_MODULE, 342 .owner = THIS_MODULE,
371}; 343};
372 344
373static int ipv4_init_net(struct net *net)
374{
375#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
376 struct nf_ip_net *in = &net->ct.nf_ct_proto;
377 in->ctl_table = kmemdup(ip_ct_sysctl_table,
378 sizeof(ip_ct_sysctl_table),
379 GFP_KERNEL);
380 if (!in->ctl_table)
381 return -ENOMEM;
382
383 in->ctl_table[0].data = &nf_conntrack_max;
384 in->ctl_table[1].data = &net->ct.count;
385 in->ctl_table[2].data = &net->ct.htable_size;
386 in->ctl_table[3].data = &net->ct.sysctl_checksum;
387 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
388#endif
389 return 0;
390}
391
392struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 345struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
393 .l3proto = PF_INET, 346 .l3proto = PF_INET,
394 .name = "ipv4", 347 .name = "ipv4",
@@ -403,9 +356,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
403 .nla_policy = ipv4_nla_policy, 356 .nla_policy = ipv4_nla_policy,
404#endif 357#endif
405#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 358#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
406 .ctl_table_path = "net/ipv4/netfilter", 359 .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
360 .ctl_table = ip_ct_sysctl_table,
407#endif 361#endif
408 .init_net = ipv4_init_net,
409 .me = THIS_MODULE, 362 .me = THIS_MODULE,
410}; 363};
411 364
@@ -416,65 +369,6 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
416MODULE_ALIAS("ip_conntrack"); 369MODULE_ALIAS("ip_conntrack");
417MODULE_LICENSE("GPL"); 370MODULE_LICENSE("GPL");
418 371
419static int ipv4_net_init(struct net *net)
420{
421 int ret = 0;
422
423 ret = nf_conntrack_l4proto_register(net,
424 &nf_conntrack_l4proto_tcp4);
425 if (ret < 0) {
426 pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
427 goto out_tcp;
428 }
429 ret = nf_conntrack_l4proto_register(net,
430 &nf_conntrack_l4proto_udp4);
431 if (ret < 0) {
432 pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
433 goto out_udp;
434 }
435 ret = nf_conntrack_l4proto_register(net,
436 &nf_conntrack_l4proto_icmp);
437 if (ret < 0) {
438 pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
439 goto out_icmp;
440 }
441 ret = nf_conntrack_l3proto_register(net,
442 &nf_conntrack_l3proto_ipv4);
443 if (ret < 0) {
444 pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
445 goto out_ipv4;
446 }
447 return 0;
448out_ipv4:
449 nf_conntrack_l4proto_unregister(net,
450 &nf_conntrack_l4proto_icmp);
451out_icmp:
452 nf_conntrack_l4proto_unregister(net,
453 &nf_conntrack_l4proto_udp4);
454out_udp:
455 nf_conntrack_l4proto_unregister(net,
456 &nf_conntrack_l4proto_tcp4);
457out_tcp:
458 return ret;
459}
460
461static void ipv4_net_exit(struct net *net)
462{
463 nf_conntrack_l3proto_unregister(net,
464 &nf_conntrack_l3proto_ipv4);
465 nf_conntrack_l4proto_unregister(net,
466 &nf_conntrack_l4proto_icmp);
467 nf_conntrack_l4proto_unregister(net,
468 &nf_conntrack_l4proto_udp4);
469 nf_conntrack_l4proto_unregister(net,
470 &nf_conntrack_l4proto_tcp4);
471}
472
473static struct pernet_operations ipv4_net_ops = {
474 .init = ipv4_net_init,
475 .exit = ipv4_net_exit,
476};
477
478static int __init nf_conntrack_l3proto_ipv4_init(void) 372static int __init nf_conntrack_l3proto_ipv4_init(void)
479{ 373{
480 int ret = 0; 374 int ret = 0;
@@ -488,17 +382,35 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
488 return ret; 382 return ret;
489 } 383 }
490 384
491 ret = register_pernet_subsys(&ipv4_net_ops); 385 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
492 if (ret < 0) { 386 if (ret < 0) {
493 pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); 387 pr_err("nf_conntrack_ipv4: can't register tcp.\n");
494 goto cleanup_sockopt; 388 goto cleanup_sockopt;
495 } 389 }
496 390
391 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
392 if (ret < 0) {
393 pr_err("nf_conntrack_ipv4: can't register udp.\n");
394 goto cleanup_tcp;
395 }
396
397 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
398 if (ret < 0) {
399 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
400 goto cleanup_udp;
401 }
402
403 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
404 if (ret < 0) {
405 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
406 goto cleanup_icmp;
407 }
408
497 ret = nf_register_hooks(ipv4_conntrack_ops, 409 ret = nf_register_hooks(ipv4_conntrack_ops,
498 ARRAY_SIZE(ipv4_conntrack_ops)); 410 ARRAY_SIZE(ipv4_conntrack_ops));
499 if (ret < 0) { 411 if (ret < 0) {
500 pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 412 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
501 goto cleanup_pernet; 413 goto cleanup_ipv4;
502 } 414 }
503#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 415#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
504 ret = nf_conntrack_ipv4_compat_init(); 416 ret = nf_conntrack_ipv4_compat_init();
@@ -510,8 +422,14 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
510 cleanup_hooks: 422 cleanup_hooks:
511 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 423 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
512#endif 424#endif
513 cleanup_pernet: 425 cleanup_ipv4:
514 unregister_pernet_subsys(&ipv4_net_ops); 426 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
427 cleanup_icmp:
428 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
429 cleanup_udp:
430 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
431 cleanup_tcp:
432 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
515 cleanup_sockopt: 433 cleanup_sockopt:
516 nf_unregister_sockopt(&so_getorigdst); 434 nf_unregister_sockopt(&so_getorigdst);
517 return ret; 435 return ret;
@@ -524,7 +442,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
524 nf_conntrack_ipv4_compat_fini(); 442 nf_conntrack_ipv4_compat_fini();
525#endif 443#endif
526 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 444 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
527 unregister_pernet_subsys(&ipv4_net_ops); 445 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
446 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
447 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
448 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
528 nf_unregister_sockopt(&so_getorigdst); 449 nf_unregister_sockopt(&so_getorigdst);
529} 450}
530 451
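
The hunk above backs out the per-namespace registration of the TCP, UDP and ICMP trackers and returns to one global registration at module load. For orientation, the removed ipv4_net_ops code was an instance of the generic pernet_operations pattern; a minimal sketch of that pattern, with invented names purely for illustration, looks like this:

    #include <linux/module.h>
    #include <net/net_namespace.h>

    /* .init runs once for every network namespace, including ones created
     * after the subsystem is registered; .exit undoes it per namespace. */
    static int __net_init example_net_init(struct net *net)
    {
            /* allocate and register per-namespace state hanging off 'net' */
            return 0;
    }

    static void __net_exit example_net_exit(struct net *net)
    {
            /* release whatever example_net_init() set up for this namespace */
    }

    static struct pernet_operations example_net_ops = {
            .init = example_net_init,
            .exit = example_net_exit,
    };

    static int __init example_module_init(void)
    {
            return register_pernet_subsys(&example_net_ops);
    }

    static void __exit example_module_exit(void)
    {
            unregister_pernet_subsys(&example_net_ops);
    }

    module_init(example_module_init);
    module_exit(example_module_exit);
    MODULE_LICENSE("GPL");
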
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 9682b36df38..5585980fce2 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -21,7 +21,6 @@
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h> 23#include <linux/rculist_nulls.h>
24#include <linux/export.h>
25 24
26struct ct_iter_state { 25struct ct_iter_state {
27 struct seq_net_private p; 26 struct seq_net_private p;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5241d997ab7..ab5b27a2916 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,11 +23,6 @@
23 23
24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
25 25
26static inline struct nf_icmp_net *icmp_pernet(struct net *net)
27{
28 return &net->ct.nf_ct_proto.icmp;
29}
30
31static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 26static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
32 struct nf_conntrack_tuple *tuple) 27 struct nf_conntrack_tuple *tuple)
33{ 28{
@@ -80,31 +75,25 @@ static int icmp_print_tuple(struct seq_file *s,
80 ntohs(tuple->src.u.icmp.id)); 75 ntohs(tuple->src.u.icmp.id));
81} 76}
82 77
83static unsigned int *icmp_get_timeouts(struct net *net)
84{
85 return &icmp_pernet(net)->timeout;
86}
87
88/* Returns verdict for packet, or -1 for invalid. */ 78/* Returns verdict for packet, or -1 for invalid. */
89static int icmp_packet(struct nf_conn *ct, 79static int icmp_packet(struct nf_conn *ct,
90 const struct sk_buff *skb, 80 const struct sk_buff *skb,
91 unsigned int dataoff, 81 unsigned int dataoff,
92 enum ip_conntrack_info ctinfo, 82 enum ip_conntrack_info ctinfo,
93 u_int8_t pf, 83 u_int8_t pf,
94 unsigned int hooknum, 84 unsigned int hooknum)
95 unsigned int *timeout)
96{ 85{
97 /* Do not immediately delete the connection after the first 86 /* Do not immediately delete the connection after the first
98 successful reply to avoid excessive conntrackd traffic 87 successful reply to avoid excessive conntrackd traffic
99 and also to handle correctly ICMP echo reply duplicates. */ 88 and also to handle correctly ICMP echo reply duplicates. */
100 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 89 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
101 90
102 return NF_ACCEPT; 91 return NF_ACCEPT;
103} 92}
104 93
105/* Called when a new connection for this protocol found. */ 94/* Called when a new connection for this protocol found. */
106static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, 95static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
107 unsigned int dataoff, unsigned int *timeouts) 96 unsigned int dataoff)
108{ 97{
109 static const u_int8_t valid_new[] = { 98 static const u_int8_t valid_new[] = {
110 [ICMP_ECHO] = 1, 99 [ICMP_ECHO] = 1,
@@ -233,10 +222,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
233static int icmp_tuple_to_nlattr(struct sk_buff *skb, 222static int icmp_tuple_to_nlattr(struct sk_buff *skb,
234 const struct nf_conntrack_tuple *t) 223 const struct nf_conntrack_tuple *t)
235{ 224{
236 if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || 225 NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
237 nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || 226 NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
238 nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) 227 NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
239 goto nla_put_failure; 228
240 return 0; 229 return 0;
241 230
242nla_put_failure: 231nla_put_failure:
@@ -274,50 +263,12 @@ static int icmp_nlattr_tuple_size(void)
274} 263}
275#endif 264#endif
276 265
277#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
278
279#include <linux/netfilter/nfnetlink.h>
280#include <linux/netfilter/nfnetlink_cttimeout.h>
281
282static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
283 struct net *net, void *data)
284{
285 unsigned int *timeout = data;
286 struct nf_icmp_net *in = icmp_pernet(net);
287
288 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
289 *timeout =
290 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
291 } else {
292 /* Set default ICMP timeout. */
293 *timeout = in->timeout;
294 }
295 return 0;
296}
297
298static int
299icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
300{
301 const unsigned int *timeout = data;
302
303 if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
304 goto nla_put_failure;
305 return 0;
306
307nla_put_failure:
308 return -ENOSPC;
309}
310
311static const struct nla_policy
312icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
313 [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
314};
315#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
316
317#ifdef CONFIG_SYSCTL 266#ifdef CONFIG_SYSCTL
267static struct ctl_table_header *icmp_sysctl_header;
318static struct ctl_table icmp_sysctl_table[] = { 268static struct ctl_table icmp_sysctl_table[] = {
319 { 269 {
320 .procname = "nf_conntrack_icmp_timeout", 270 .procname = "nf_conntrack_icmp_timeout",
271 .data = &nf_ct_icmp_timeout,
321 .maxlen = sizeof(unsigned int), 272 .maxlen = sizeof(unsigned int),
322 .mode = 0644, 273 .mode = 0644,
323 .proc_handler = proc_dointvec_jiffies, 274 .proc_handler = proc_dointvec_jiffies,
@@ -328,6 +279,7 @@ static struct ctl_table icmp_sysctl_table[] = {
328static struct ctl_table icmp_compat_sysctl_table[] = { 279static struct ctl_table icmp_compat_sysctl_table[] = {
329 { 280 {
330 .procname = "ip_conntrack_icmp_timeout", 281 .procname = "ip_conntrack_icmp_timeout",
282 .data = &nf_ct_icmp_timeout,
331 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
332 .mode = 0644, 284 .mode = 0644,
333 .proc_handler = proc_dointvec_jiffies, 285 .proc_handler = proc_dointvec_jiffies,
@@ -337,62 +289,6 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
337#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 289#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
338#endif /* CONFIG_SYSCTL */ 290#endif /* CONFIG_SYSCTL */
339 291
340static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
341 struct nf_icmp_net *in)
342{
343#ifdef CONFIG_SYSCTL
344 pn->ctl_table = kmemdup(icmp_sysctl_table,
345 sizeof(icmp_sysctl_table),
346 GFP_KERNEL);
347 if (!pn->ctl_table)
348 return -ENOMEM;
349
350 pn->ctl_table[0].data = &in->timeout;
351#endif
352 return 0;
353}
354
355static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
356 struct nf_icmp_net *in)
357{
358#ifdef CONFIG_SYSCTL
359#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
360 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
361 sizeof(icmp_compat_sysctl_table),
362 GFP_KERNEL);
363 if (!pn->ctl_compat_table)
364 return -ENOMEM;
365
366 pn->ctl_compat_table[0].data = &in->timeout;
367#endif
368#endif
369 return 0;
370}
371
372static int icmp_init_net(struct net *net, u_int16_t proto)
373{
374 int ret;
375 struct nf_icmp_net *in = icmp_pernet(net);
376 struct nf_proto_net *pn = &in->pn;
377
378 in->timeout = nf_ct_icmp_timeout;
379
380 ret = icmp_kmemdup_compat_sysctl_table(pn, in);
381 if (ret < 0)
382 return ret;
383
384 ret = icmp_kmemdup_sysctl_table(pn, in);
385 if (ret < 0)
386 nf_ct_kfree_compat_sysctl_table(pn);
387
388 return ret;
389}
390
391static struct nf_proto_net *icmp_get_net_proto(struct net *net)
392{
393 return &net->ct.nf_ct_proto.icmp.pn;
394}
395
396struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = 292struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
397{ 293{
398 .l3proto = PF_INET, 294 .l3proto = PF_INET,
@@ -402,7 +298,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
402 .invert_tuple = icmp_invert_tuple, 298 .invert_tuple = icmp_invert_tuple,
403 .print_tuple = icmp_print_tuple, 299 .print_tuple = icmp_print_tuple,
404 .packet = icmp_packet, 300 .packet = icmp_packet,
405 .get_timeouts = icmp_get_timeouts,
406 .new = icmp_new, 301 .new = icmp_new,
407 .error = icmp_error, 302 .error = icmp_error,
408 .destroy = NULL, 303 .destroy = NULL,
@@ -413,15 +308,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
413 .nlattr_to_tuple = icmp_nlattr_to_tuple, 308 .nlattr_to_tuple = icmp_nlattr_to_tuple,
414 .nla_policy = icmp_nla_policy, 309 .nla_policy = icmp_nla_policy,
415#endif 310#endif
416#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) 311#ifdef CONFIG_SYSCTL
417 .ctnl_timeout = { 312 .ctl_table_header = &icmp_sysctl_header,
418 .nlattr_to_obj = icmp_timeout_nlattr_to_obj, 313 .ctl_table = icmp_sysctl_table,
419 .obj_to_nlattr = icmp_timeout_obj_to_nlattr, 314#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
420 .nlattr_max = CTA_TIMEOUT_ICMP_MAX, 315 .ctl_compat_table = icmp_compat_sysctl_table,
421 .obj_size = sizeof(unsigned int), 316#endif
422 .nla_policy = icmp_timeout_nla_policy, 317#endif
423 },
424#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
425 .init_net = icmp_init_net,
426 .get_net_proto = icmp_get_net_proto,
427}; 318};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 742815518b0..9bb1b8a37a2 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
94 { 94 {
95 .hook = ipv4_conntrack_defrag, 95 .hook = ipv4_conntrack_defrag,
96 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
97 .pf = NFPROTO_IPV4, 97 .pf = PF_INET,
98 .hooknum = NF_INET_PRE_ROUTING, 98 .hooknum = NF_INET_PRE_ROUTING,
99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
100 }, 100 },
101 { 101 {
102 .hook = ipv4_conntrack_defrag, 102 .hook = ipv4_conntrack_defrag,
103 .owner = THIS_MODULE, 103 .owner = THIS_MODULE,
104 .pf = NFPROTO_IPV4, 104 .pf = PF_INET,
105 .hooknum = NF_INET_LOCAL_OUT, 105 .hooknum = NF_INET_LOCAL_OUT,
106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
107 }, 107 },
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 9c3db10b22d..790f3160e01 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,12 +15,13 @@
15 15
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h> 17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_conntrack_helper.h> 19#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_expect.h> 20#include <net/netfilter/nf_conntrack_expect.h>
20#include <linux/netfilter/nf_conntrack_h323.h> 21#include <linux/netfilter/nf_conntrack_h323.h>
21 22
22/****************************************************************************/ 23/****************************************************************************/
23static int set_addr(struct sk_buff *skb, unsigned int protoff, 24static int set_addr(struct sk_buff *skb,
24 unsigned char **data, int dataoff, 25 unsigned char **data, int dataoff,
25 unsigned int addroff, __be32 ip, __be16 port) 26 unsigned int addroff, __be32 ip, __be16 port)
26{ 27{
@@ -39,9 +40,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
39 40
40 if (ip_hdr(skb)->protocol == IPPROTO_TCP) { 41 if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
41 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
42 protoff, addroff, sizeof(buf), 43 addroff, sizeof(buf),
43 (char *) &buf, sizeof(buf))) { 44 (char *) &buf, sizeof(buf))) {
44 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); 45 if (net_ratelimit())
46 pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
47 " error\n");
45 return -1; 48 return -1;
46 } 49 }
47 50
@@ -53,9 +56,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
53 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; 56 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
54 } else { 57 } else {
55 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 58 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
56 protoff, addroff, sizeof(buf), 59 addroff, sizeof(buf),
57 (char *) &buf, sizeof(buf))) { 60 (char *) &buf, sizeof(buf))) {
58 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); 61 if (net_ratelimit())
62 pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
63 " error\n");
59 return -1; 64 return -1;
60 } 65 }
61 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy 66 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -68,22 +73,22 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
68} 73}
69 74
70/****************************************************************************/ 75/****************************************************************************/
71static int set_h225_addr(struct sk_buff *skb, unsigned int protoff, 76static int set_h225_addr(struct sk_buff *skb,
72 unsigned char **data, int dataoff, 77 unsigned char **data, int dataoff,
73 TransportAddress *taddr, 78 TransportAddress *taddr,
74 union nf_inet_addr *addr, __be16 port) 79 union nf_inet_addr *addr, __be16 port)
75{ 80{
76 return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip, 81 return set_addr(skb, data, dataoff, taddr->ipAddress.ip,
77 addr->ip, port); 82 addr->ip, port);
78} 83}
79 84
80/****************************************************************************/ 85/****************************************************************************/
81static int set_h245_addr(struct sk_buff *skb, unsigned protoff, 86static int set_h245_addr(struct sk_buff *skb,
82 unsigned char **data, int dataoff, 87 unsigned char **data, int dataoff,
83 H245_TransportAddress *taddr, 88 H245_TransportAddress *taddr,
84 union nf_inet_addr *addr, __be16 port) 89 union nf_inet_addr *addr, __be16 port)
85{ 90{
86 return set_addr(skb, protoff, data, dataoff, 91 return set_addr(skb, data, dataoff,
87 taddr->unicastAddress.iPAddress.network, 92 taddr->unicastAddress.iPAddress.network,
88 addr->ip, port); 93 addr->ip, port);
89} 94}
@@ -91,10 +96,10 @@ static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
91/****************************************************************************/ 96/****************************************************************************/
92static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, 97static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
93 enum ip_conntrack_info ctinfo, 98 enum ip_conntrack_info ctinfo,
94 unsigned int protoff, unsigned char **data, 99 unsigned char **data,
95 TransportAddress *taddr, int count) 100 TransportAddress *taddr, int count)
96{ 101{
97 const struct nf_ct_h323_master *info = nfct_help_data(ct); 102 const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
98 int dir = CTINFO2DIR(ctinfo); 103 int dir = CTINFO2DIR(ctinfo);
99 int i; 104 int i;
100 __be16 port; 105 __be16 port;
@@ -117,8 +122,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
117 &addr.ip, port, 122 &addr.ip, port,
118 &ct->tuplehash[!dir].tuple.dst.u3.ip, 123 &ct->tuplehash[!dir].tuple.dst.u3.ip,
119 info->sig_port[!dir]); 124 info->sig_port[!dir]);
120 return set_h225_addr(skb, protoff, data, 0, 125 return set_h225_addr(skb, data, 0, &taddr[i],
121 &taddr[i],
122 &ct->tuplehash[!dir]. 126 &ct->tuplehash[!dir].
123 tuple.dst.u3, 127 tuple.dst.u3,
124 info->sig_port[!dir]); 128 info->sig_port[!dir]);
@@ -129,8 +133,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
129 &addr.ip, port, 133 &addr.ip, port,
130 &ct->tuplehash[!dir].tuple.src.u3.ip, 134 &ct->tuplehash[!dir].tuple.src.u3.ip,
131 info->sig_port[!dir]); 135 info->sig_port[!dir]);
132 return set_h225_addr(skb, protoff, data, 0, 136 return set_h225_addr(skb, data, 0, &taddr[i],
133 &taddr[i],
134 &ct->tuplehash[!dir]. 137 &ct->tuplehash[!dir].
135 tuple.src.u3, 138 tuple.src.u3,
136 info->sig_port[!dir]); 139 info->sig_port[!dir]);
@@ -144,7 +147,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
144/****************************************************************************/ 147/****************************************************************************/
145static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, 148static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
146 enum ip_conntrack_info ctinfo, 149 enum ip_conntrack_info ctinfo,
147 unsigned int protoff, unsigned char **data, 150 unsigned char **data,
148 TransportAddress *taddr, int count) 151 TransportAddress *taddr, int count)
149{ 152{
150 int dir = CTINFO2DIR(ctinfo); 153 int dir = CTINFO2DIR(ctinfo);
@@ -160,7 +163,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
160 &addr.ip, ntohs(port), 163 &addr.ip, ntohs(port),
161 &ct->tuplehash[!dir].tuple.dst.u3.ip, 164 &ct->tuplehash[!dir].tuple.dst.u3.ip,
162 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 165 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
163 return set_h225_addr(skb, protoff, data, 0, &taddr[i], 166 return set_h225_addr(skb, data, 0, &taddr[i],
164 &ct->tuplehash[!dir].tuple.dst.u3, 167 &ct->tuplehash[!dir].tuple.dst.u3,
165 ct->tuplehash[!dir].tuple. 168 ct->tuplehash[!dir].tuple.
166 dst.u.udp.port); 169 dst.u.udp.port);
@@ -173,13 +176,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
173/****************************************************************************/ 176/****************************************************************************/
174static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, 177static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
175 enum ip_conntrack_info ctinfo, 178 enum ip_conntrack_info ctinfo,
176 unsigned int protoff, unsigned char **data, int dataoff, 179 unsigned char **data, int dataoff,
177 H245_TransportAddress *taddr, 180 H245_TransportAddress *taddr,
178 __be16 port, __be16 rtp_port, 181 __be16 port, __be16 rtp_port,
179 struct nf_conntrack_expect *rtp_exp, 182 struct nf_conntrack_expect *rtp_exp,
180 struct nf_conntrack_expect *rtcp_exp) 183 struct nf_conntrack_expect *rtcp_exp)
181{ 184{
182 struct nf_ct_h323_master *info = nfct_help_data(ct); 185 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
183 int dir = CTINFO2DIR(ctinfo); 186 int dir = CTINFO2DIR(ctinfo);
184 int i; 187 int i;
185 u_int16_t nated_port; 188 u_int16_t nated_port;
@@ -211,7 +214,8 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
211 214
212 /* Run out of expectations */ 215 /* Run out of expectations */
213 if (i >= H323_RTP_CHANNEL_MAX) { 216 if (i >= H323_RTP_CHANNEL_MAX) {
214 net_notice_ratelimited("nf_nat_h323: out of expectations\n"); 217 if (net_ratelimit())
218 pr_notice("nf_nat_h323: out of expectations\n");
215 return 0; 219 return 0;
216 } 220 }
217 221
@@ -240,12 +244,13 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
240 } 244 }
241 245
242 if (nated_port == 0) { /* No port available */ 246 if (nated_port == 0) { /* No port available */
243 net_notice_ratelimited("nf_nat_h323: out of RTP ports\n"); 247 if (net_ratelimit())
248 pr_notice("nf_nat_h323: out of RTP ports\n");
244 return 0; 249 return 0;
245 } 250 }
246 251
247 /* Modify signal */ 252 /* Modify signal */
248 if (set_h245_addr(skb, protoff, data, dataoff, taddr, 253 if (set_h245_addr(skb, data, dataoff, taddr,
249 &ct->tuplehash[!dir].tuple.dst.u3, 254 &ct->tuplehash[!dir].tuple.dst.u3,
250 htons((port & htons(1)) ? nated_port + 1 : 255 htons((port & htons(1)) ? nated_port + 1 :
251 nated_port)) == 0) { 256 nated_port)) == 0) {
@@ -276,7 +281,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
276/****************************************************************************/ 281/****************************************************************************/
277static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, 282static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
278 enum ip_conntrack_info ctinfo, 283 enum ip_conntrack_info ctinfo,
279 unsigned int protoff, unsigned char **data, int dataoff, 284 unsigned char **data, int dataoff,
280 H245_TransportAddress *taddr, __be16 port, 285 H245_TransportAddress *taddr, __be16 port,
281 struct nf_conntrack_expect *exp) 286 struct nf_conntrack_expect *exp)
282{ 287{
@@ -303,12 +308,13 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
303 } 308 }
304 309
305 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
306 net_notice_ratelimited("nf_nat_h323: out of TCP ports\n"); 311 if (net_ratelimit())
312 pr_notice("nf_nat_h323: out of TCP ports\n");
307 return 0; 313 return 0;
308 } 314 }
309 315
310 /* Modify signal */ 316 /* Modify signal */
311 if (set_h245_addr(skb, protoff, data, dataoff, taddr, 317 if (set_h245_addr(skb, data, dataoff, taddr,
312 &ct->tuplehash[!dir].tuple.dst.u3, 318 &ct->tuplehash[!dir].tuple.dst.u3,
313 htons(nated_port)) < 0) { 319 htons(nated_port)) < 0) {
314 nf_ct_unexpect_related(exp); 320 nf_ct_unexpect_related(exp);
@@ -327,11 +333,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
327/****************************************************************************/ 333/****************************************************************************/
328static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, 334static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
329 enum ip_conntrack_info ctinfo, 335 enum ip_conntrack_info ctinfo,
330 unsigned int protoff, unsigned char **data, int dataoff, 336 unsigned char **data, int dataoff,
331 TransportAddress *taddr, __be16 port, 337 TransportAddress *taddr, __be16 port,
332 struct nf_conntrack_expect *exp) 338 struct nf_conntrack_expect *exp)
333{ 339{
334 struct nf_ct_h323_master *info = nfct_help_data(ct); 340 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
335 int dir = CTINFO2DIR(ctinfo); 341 int dir = CTINFO2DIR(ctinfo);
336 u_int16_t nated_port = ntohs(port); 342 u_int16_t nated_port = ntohs(port);
337 343
@@ -359,12 +365,13 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
359 } 365 }
360 366
361 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
362 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); 368 if (net_ratelimit())
369 pr_notice("nf_nat_q931: out of TCP ports\n");
363 return 0; 370 return 0;
364 } 371 }
365 372
366 /* Modify signal */ 373 /* Modify signal */
367 if (set_h225_addr(skb, protoff, data, dataoff, taddr, 374 if (set_h225_addr(skb, data, dataoff, taddr,
368 &ct->tuplehash[!dir].tuple.dst.u3, 375 &ct->tuplehash[!dir].tuple.dst.u3,
369 htons(nated_port)) == 0) { 376 htons(nated_port)) == 0) {
370 /* Save ports */ 377 /* Save ports */
@@ -402,27 +409,25 @@ static void ip_nat_q931_expect(struct nf_conn *new,
402 BUG_ON(new->status & IPS_NAT_DONE_MASK); 409 BUG_ON(new->status & IPS_NAT_DONE_MASK);
403 410
404 /* Change src to where master sends to */ 411 /* Change src to where master sends to */
405 range.flags = NF_NAT_RANGE_MAP_IPS; 412 range.flags = IP_NAT_RANGE_MAP_IPS;
406 range.min_addr = range.max_addr = 413 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
407 new->tuplehash[!this->dir].tuple.src.u3; 414 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
408 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
409 415
410 /* For DST manip, map port here to where it's expected. */ 416 /* For DST manip, map port here to where it's expected. */
411 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 417 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
412 range.min_proto = range.max_proto = this->saved_proto; 418 range.min = range.max = this->saved_proto;
413 range.min_addr = range.max_addr = 419 range.min_ip = range.max_ip =
414 new->master->tuplehash[!this->dir].tuple.src.u3; 420 new->master->tuplehash[!this->dir].tuple.src.u3.ip;
415 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 421 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
416} 422}
417 423
418/****************************************************************************/ 424/****************************************************************************/
419static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, 425static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
420 enum ip_conntrack_info ctinfo, 426 enum ip_conntrack_info ctinfo,
421 unsigned int protoff, unsigned char **data, 427 unsigned char **data, TransportAddress *taddr, int idx,
422 TransportAddress *taddr, int idx,
423 __be16 port, struct nf_conntrack_expect *exp) 428 __be16 port, struct nf_conntrack_expect *exp)
424{ 429{
425 struct nf_ct_h323_master *info = nfct_help_data(ct); 430 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
426 int dir = CTINFO2DIR(ctinfo); 431 int dir = CTINFO2DIR(ctinfo);
427 u_int16_t nated_port = ntohs(port); 432 u_int16_t nated_port = ntohs(port);
428 union nf_inet_addr addr; 433 union nf_inet_addr addr;
@@ -451,12 +456,13 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
451 } 456 }
452 457
453 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
454 net_notice_ratelimited("nf_nat_ras: out of TCP ports\n"); 459 if (net_ratelimit())
460 pr_notice("nf_nat_ras: out of TCP ports\n");
455 return 0; 461 return 0;
456 } 462 }
457 463
458 /* Modify signal */ 464 /* Modify signal */
459 if (set_h225_addr(skb, protoff, data, 0, &taddr[idx], 465 if (set_h225_addr(skb, data, 0, &taddr[idx],
460 &ct->tuplehash[!dir].tuple.dst.u3, 466 &ct->tuplehash[!dir].tuple.dst.u3,
461 htons(nated_port)) == 0) { 467 htons(nated_port)) == 0) {
462 /* Save ports */ 468 /* Save ports */
@@ -467,7 +473,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
467 if (idx > 0 && 473 if (idx > 0 &&
468 get_h225_addr(ct, *data, &taddr[0], &addr, &port) && 474 get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
469 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { 475 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
470 set_h225_addr(skb, protoff, data, 0, &taddr[0], 476 set_h225_addr(skb, data, 0, &taddr[0],
471 &ct->tuplehash[!dir].tuple.dst.u3, 477 &ct->tuplehash[!dir].tuple.dst.u3,
472 info->sig_port[!dir]); 478 info->sig_port[!dir]);
473 } 479 }
@@ -496,22 +502,20 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,
496 BUG_ON(new->status & IPS_NAT_DONE_MASK); 502 BUG_ON(new->status & IPS_NAT_DONE_MASK);
497 503
498 /* Change src to where master sends to */ 504 /* Change src to where master sends to */
499 range.flags = NF_NAT_RANGE_MAP_IPS; 505 range.flags = IP_NAT_RANGE_MAP_IPS;
500 range.min_addr = range.max_addr = 506 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
501 new->tuplehash[!this->dir].tuple.src.u3; 507 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
502 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
503 508
504 /* For DST manip, map port here to where it's expected. */ 509 /* For DST manip, map port here to where it's expected. */
505 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 510 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
506 range.min_proto = range.max_proto = this->saved_proto; 511 range.min = range.max = this->saved_proto;
507 range.min_addr = range.max_addr = this->saved_addr; 512 range.min_ip = range.max_ip = this->saved_ip;
508 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 513 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
509} 514}
510 515
511/****************************************************************************/ 516/****************************************************************************/
512static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, 517static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
513 enum ip_conntrack_info ctinfo, 518 enum ip_conntrack_info ctinfo,
514 unsigned int protoff,
515 unsigned char **data, int dataoff, 519 unsigned char **data, int dataoff,
516 TransportAddress *taddr, __be16 port, 520 TransportAddress *taddr, __be16 port,
517 struct nf_conntrack_expect *exp) 521 struct nf_conntrack_expect *exp)
@@ -520,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
520 u_int16_t nated_port; 524 u_int16_t nated_port;
521 525
522 /* Set expectations for NAT */ 526 /* Set expectations for NAT */
523 exp->saved_addr = exp->tuple.dst.u3; 527 exp->saved_ip = exp->tuple.dst.u3.ip;
524 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 528 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
525 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; 529 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
526 exp->expectfn = ip_nat_callforwarding_expect; 530 exp->expectfn = ip_nat_callforwarding_expect;
@@ -541,12 +545,13 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
541 } 545 }
542 546
543 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
544 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); 548 if (net_ratelimit())
549 pr_notice("nf_nat_q931: out of TCP ports\n");
545 return 0; 550 return 0;
546 } 551 }
547 552
548 /* Modify signal */ 553 /* Modify signal */
549 if (!set_h225_addr(skb, protoff, data, dataoff, taddr, 554 if (!set_h225_addr(skb, data, dataoff, taddr,
550 &ct->tuplehash[!dir].tuple.dst.u3, 555 &ct->tuplehash[!dir].tuple.dst.u3,
551 htons(nated_port)) == 0) { 556 htons(nated_port)) == 0) {
552 nf_ct_unexpect_related(exp); 557 nf_ct_unexpect_related(exp);
@@ -563,16 +568,6 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
563 return 0; 568 return 0;
564} 569}
565 570
566static struct nf_ct_helper_expectfn q931_nat = {
567 .name = "Q.931",
568 .expectfn = ip_nat_q931_expect,
569};
570
571static struct nf_ct_helper_expectfn callforwarding_nat = {
572 .name = "callforwarding",
573 .expectfn = ip_nat_callforwarding_expect,
574};
575
576/****************************************************************************/ 571/****************************************************************************/
577static int __init init(void) 572static int __init init(void)
578{ 573{
@@ -586,34 +581,30 @@ static int __init init(void)
586 BUG_ON(nat_callforwarding_hook != NULL); 581 BUG_ON(nat_callforwarding_hook != NULL);
587 BUG_ON(nat_q931_hook != NULL); 582 BUG_ON(nat_q931_hook != NULL);
588 583
589 RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); 584 rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);
590 RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); 585 rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
591 RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); 586 rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
592 RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); 587 rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
593 RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); 588 rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
594 RCU_INIT_POINTER(nat_t120_hook, nat_t120); 589 rcu_assign_pointer(nat_t120_hook, nat_t120);
595 RCU_INIT_POINTER(nat_h245_hook, nat_h245); 590 rcu_assign_pointer(nat_h245_hook, nat_h245);
596 RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); 591 rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
597 RCU_INIT_POINTER(nat_q931_hook, nat_q931); 592 rcu_assign_pointer(nat_q931_hook, nat_q931);
598 nf_ct_helper_expectfn_register(&q931_nat);
599 nf_ct_helper_expectfn_register(&callforwarding_nat);
600 return 0; 593 return 0;
601} 594}
602 595
603/****************************************************************************/ 596/****************************************************************************/
604static void __exit fini(void) 597static void __exit fini(void)
605{ 598{
606 RCU_INIT_POINTER(set_h245_addr_hook, NULL); 599 rcu_assign_pointer(set_h245_addr_hook, NULL);
607 RCU_INIT_POINTER(set_h225_addr_hook, NULL); 600 rcu_assign_pointer(set_h225_addr_hook, NULL);
608 RCU_INIT_POINTER(set_sig_addr_hook, NULL); 601 rcu_assign_pointer(set_sig_addr_hook, NULL);
609 RCU_INIT_POINTER(set_ras_addr_hook, NULL); 602 rcu_assign_pointer(set_ras_addr_hook, NULL);
610 RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); 603 rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
611 RCU_INIT_POINTER(nat_t120_hook, NULL); 604 rcu_assign_pointer(nat_t120_hook, NULL);
612 RCU_INIT_POINTER(nat_h245_hook, NULL); 605 rcu_assign_pointer(nat_h245_hook, NULL);
613 RCU_INIT_POINTER(nat_callforwarding_hook, NULL); 606 rcu_assign_pointer(nat_callforwarding_hook, NULL);
614 RCU_INIT_POINTER(nat_q931_hook, NULL); 607 rcu_assign_pointer(nat_q931_hook, NULL);
615 nf_ct_helper_expectfn_unregister(&q931_nat);
616 nf_ct_helper_expectfn_unregister(&callforwarding_nat);
617 synchronize_rcu(); 608 synchronize_rcu();
618} 609}
619 610
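
Several expectation callbacks in this file (ip_nat_q931_expect, ip_nat_callforwarding_expect) and in nf_nat_pptp.c further down are converted back from the newer nf_nat_range layout (min_addr/max_addr, NF_NAT_RANGE_* flags, NF_NAT_MANIP_*) to the older one (min_ip/max_ip, IP_NAT_RANGE_* flags, IP_NAT_MANIP_*). The recurring shape of such a callback, written with the older field names exactly as they appear in the restored column (a sketch, not the file's literal code), is:

    static void example_nat_expected(struct nf_conn *new,
                                     struct nf_conntrack_expect *this)
    {
            struct nf_nat_range range;

            /* SRC manip: map the source to where the master conntrack
             * already sends; only the address is constrained. */
            range.flags = IP_NAT_RANGE_MAP_IPS;
            range.min_ip = range.max_ip =
                    new->tuplehash[!this->dir].tuple.src.u3.ip;
            nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);

            /* DST manip: additionally pin the port saved on the expectation */
            range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
            range.min = range.max = this->saved_proto;
            range.min_ip = range.max_ip =
                    new->master->tuplehash[!this->dir].tuple.src.u3.ip;
            nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
    }
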
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
deleted file mode 100644
index d8b2e14efdd..00000000000
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2011 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/icmp.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <net/secure_seq.h>
19#include <net/checksum.h>
20#include <net/route.h>
21#include <net/ip.h>
22
23#include <net/netfilter/nf_conntrack_core.h>
24#include <net/netfilter/nf_conntrack.h>
25#include <net/netfilter/nf_nat_core.h>
26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/netfilter/nf_nat_l4proto.h>
28
29static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
30
31#ifdef CONFIG_XFRM
32static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
33 const struct nf_conn *ct,
34 enum ip_conntrack_dir dir,
35 unsigned long statusbit,
36 struct flowi *fl)
37{
38 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
39 struct flowi4 *fl4 = &fl->u.ip4;
40
41 if (ct->status & statusbit) {
42 fl4->daddr = t->dst.u3.ip;
43 if (t->dst.protonum == IPPROTO_TCP ||
44 t->dst.protonum == IPPROTO_UDP ||
45 t->dst.protonum == IPPROTO_UDPLITE ||
46 t->dst.protonum == IPPROTO_DCCP ||
47 t->dst.protonum == IPPROTO_SCTP)
48 fl4->fl4_dport = t->dst.u.all;
49 }
50
51 statusbit ^= IPS_NAT_MASK;
52
53 if (ct->status & statusbit) {
54 fl4->saddr = t->src.u3.ip;
55 if (t->dst.protonum == IPPROTO_TCP ||
56 t->dst.protonum == IPPROTO_UDP ||
57 t->dst.protonum == IPPROTO_UDPLITE ||
58 t->dst.protonum == IPPROTO_DCCP ||
59 t->dst.protonum == IPPROTO_SCTP)
60 fl4->fl4_sport = t->src.u.all;
61 }
62}
63#endif /* CONFIG_XFRM */
64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range)
67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
70}
71
72static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
73 __be16 dport)
74{
75 return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
76}
77
78static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
79 unsigned int iphdroff,
80 const struct nf_nat_l4proto *l4proto,
81 const struct nf_conntrack_tuple *target,
82 enum nf_nat_manip_type maniptype)
83{
84 struct iphdr *iph;
85 unsigned int hdroff;
86
87 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
88 return false;
89
90 iph = (void *)skb->data + iphdroff;
91 hdroff = iphdroff + iph->ihl * 4;
92
93 if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
94 target, maniptype))
95 return false;
96 iph = (void *)skb->data + iphdroff;
97
98 if (maniptype == NF_NAT_MANIP_SRC) {
99 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
100 iph->saddr = target->src.u3.ip;
101 } else {
102 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
103 iph->daddr = target->dst.u3.ip;
104 }
105 return true;
106}
107
108static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
109 unsigned int iphdroff, __sum16 *check,
110 const struct nf_conntrack_tuple *t,
111 enum nf_nat_manip_type maniptype)
112{
113 struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
114 __be32 oldip, newip;
115
116 if (maniptype == NF_NAT_MANIP_SRC) {
117 oldip = iph->saddr;
118 newip = t->src.u3.ip;
119 } else {
120 oldip = iph->daddr;
121 newip = t->dst.u3.ip;
122 }
123 inet_proto_csum_replace4(check, skb, oldip, newip, 1);
124}
125
126static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
127 u8 proto, void *data, __sum16 *check,
128 int datalen, int oldlen)
129{
130 const struct iphdr *iph = ip_hdr(skb);
131 struct rtable *rt = skb_rtable(skb);
132
133 if (skb->ip_summed != CHECKSUM_PARTIAL) {
134 if (!(rt->rt_flags & RTCF_LOCAL) &&
135 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
136 skb->ip_summed = CHECKSUM_PARTIAL;
137 skb->csum_start = skb_headroom(skb) +
138 skb_network_offset(skb) +
139 ip_hdrlen(skb);
140 skb->csum_offset = (void *)check - data;
141 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
142 datalen, proto, 0);
143 } else {
144 *check = 0;
145 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
146 datalen, proto,
147 csum_partial(data, datalen,
148 0));
149 if (proto == IPPROTO_UDP && !*check)
150 *check = CSUM_MANGLED_0;
151 }
152 } else
153 inet_proto_csum_replace2(check, skb,
154 htons(oldlen), htons(datalen), 1);
155}
156
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range)
159{
160 if (tb[CTA_NAT_V4_MINIP]) {
161 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
162 range->flags |= NF_NAT_RANGE_MAP_IPS;
163 }
164
165 if (tb[CTA_NAT_V4_MAXIP])
166 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
167 else
168 range->max_addr.ip = range->min_addr.ip;
169
170 return 0;
171}
172
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4,
175 .in_range = nf_nat_ipv4_in_range,
176 .secure_port = nf_nat_ipv4_secure_port,
177 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc,
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
181#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session,
183#endif
184};
185
186int nf_nat_icmp_reply_translation(struct sk_buff *skb,
187 struct nf_conn *ct,
188 enum ip_conntrack_info ctinfo,
189 unsigned int hooknum)
190{
191 struct {
192 struct icmphdr icmp;
193 struct iphdr ip;
194 } *inside;
195 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
196 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
197 unsigned int hdrlen = ip_hdrlen(skb);
198 const struct nf_nat_l4proto *l4proto;
199 struct nf_conntrack_tuple target;
200 unsigned long statusbit;
201
202 NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
203
204 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
205 return 0;
206 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
207 return 0;
208
209 inside = (void *)skb->data + hdrlen;
210 if (inside->icmp.type == ICMP_REDIRECT) {
211 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
212 return 0;
213 if (ct->status & IPS_NAT_MASK)
214 return 0;
215 }
216
217 if (manip == NF_NAT_MANIP_SRC)
218 statusbit = IPS_SRC_NAT;
219 else
220 statusbit = IPS_DST_NAT;
221
222 /* Invert if this is reply direction */
223 if (dir == IP_CT_DIR_REPLY)
224 statusbit ^= IPS_NAT_MASK;
225
226 if (!(ct->status & statusbit))
227 return 1;
228
229 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
230 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
231 l4proto, &ct->tuplehash[!dir].tuple, !manip))
232 return 0;
233
234 if (skb->ip_summed != CHECKSUM_PARTIAL) {
235 /* Reloading "inside" here since manip_pkt may reallocate */
236 inside = (void *)skb->data + hdrlen;
237 inside->icmp.checksum = 0;
238 inside->icmp.checksum =
239 csum_fold(skb_checksum(skb, hdrlen,
240 skb->len - hdrlen, 0));
241 }
242
243 /* Change outer to look like the reply to an incoming packet */
244 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
245 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
246 if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
247 return 0;
248
249 return 1;
250}
251EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
252
253static int __init nf_nat_l3proto_ipv4_init(void)
254{
255 int err;
256
257 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
258 if (err < 0)
259 goto err1;
260 err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
261 if (err < 0)
262 goto err2;
263 return err;
264
265err2:
266 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
267err1:
268 return err;
269}
270
271static void __exit nf_nat_l3proto_ipv4_exit(void)
272{
273 nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
274 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
275}
276
277MODULE_LICENSE("GPL");
278MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
279
280module_init(nf_nat_l3proto_ipv4_init);
281module_exit(nf_nat_l3proto_ipv4_exit);
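
Both the deleted nf_nat_l3proto_ipv4_init() above and nf_conntrack_l3proto_ipv4_init() earlier in the series rely on the kernel's usual goto-unwind error handling: each successfully registered resource gets a label, and a later failure jumps to the label that releases everything acquired so far, in reverse order. A small, self-contained user-space illustration of the same idiom (nothing here is kernel API; it only mirrors the control flow of the error paths above):

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-ins for resources that must be released in reverse order */
    static int acquire(const char *name) { printf("acquire %s\n", name); return 0; }
    static void release(const char *name) { printf("release %s\n", name); }

    static int setup(void)
    {
            int err;

            err = acquire("tcp");
            if (err)
                    goto out;
            err = acquire("udp");
            if (err)
                    goto out_tcp;
            err = acquire("icmp");
            if (err)
                    goto out_udp;
            return 0;               /* everything registered */

    out_udp:
            release("udp");
    out_tcp:
            release("tcp");
    out:
            return err;
    }

    int main(void)
    {
            return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
    }
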
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index a06d7d74817..4c060038d29 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,6 +22,7 @@
22 22
23#include <net/netfilter/nf_nat.h> 23#include <net/netfilter/nf_nat.h>
24#include <net/netfilter/nf_nat_helper.h> 24#include <net/netfilter/nf_nat_helper.h>
25#include <net/netfilter/nf_nat_rule.h>
25#include <net/netfilter/nf_conntrack_helper.h> 26#include <net/netfilter/nf_conntrack_helper.h>
26#include <net/netfilter/nf_conntrack_expect.h> 27#include <net/netfilter/nf_conntrack_expect.h>
27#include <net/netfilter/nf_conntrack_zones.h> 28#include <net/netfilter/nf_conntrack_zones.h>
@@ -48,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
48 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
49 struct nf_nat_range range; 50 struct nf_nat_range range;
50 51
51 ct_pptp_info = nfct_help_data(master); 52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
52 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
53 54
54 /* And here goes the grand finale of corrosion... */ 55 /* And here goes the grand finale of corrosion... */
@@ -87,24 +88,24 @@ static void pptp_nat_expected(struct nf_conn *ct,
87 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 88 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
88 89
89 /* Change src to where master sends to */ 90 /* Change src to where master sends to */
90 range.flags = NF_NAT_RANGE_MAP_IPS; 91 range.flags = IP_NAT_RANGE_MAP_IPS;
91 range.min_addr = range.max_addr 92 range.min_ip = range.max_ip
92 = ct->master->tuplehash[!exp->dir].tuple.dst.u3; 93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
93 if (exp->dir == IP_CT_DIR_ORIGINAL) { 94 if (exp->dir == IP_CT_DIR_ORIGINAL) {
94 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 95 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
95 range.min_proto = range.max_proto = exp->saved_proto; 96 range.min = range.max = exp->saved_proto;
96 } 97 }
97 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); 98 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
98 99
99 /* For DST manip, map port here to where it's expected. */ 100 /* For DST manip, map port here to where it's expected. */
100 range.flags = NF_NAT_RANGE_MAP_IPS; 101 range.flags = IP_NAT_RANGE_MAP_IPS;
101 range.min_addr = range.max_addr 102 range.min_ip = range.max_ip
102 = ct->master->tuplehash[!exp->dir].tuple.src.u3; 103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
103 if (exp->dir == IP_CT_DIR_REPLY) { 104 if (exp->dir == IP_CT_DIR_REPLY) {
104 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 105 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
105 range.min_proto = range.max_proto = exp->saved_proto; 106 range.min = range.max = exp->saved_proto;
106 } 107 }
107 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); 108 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
108} 109}
109 110
110/* outbound packets == from PNS to PAC */ 111/* outbound packets == from PNS to PAC */
@@ -112,7 +113,6 @@ static int
112pptp_outbound_pkt(struct sk_buff *skb, 113pptp_outbound_pkt(struct sk_buff *skb,
113 struct nf_conn *ct, 114 struct nf_conn *ct,
114 enum ip_conntrack_info ctinfo, 115 enum ip_conntrack_info ctinfo,
115 unsigned int protoff,
116 struct PptpControlHeader *ctlh, 116 struct PptpControlHeader *ctlh,
117 union pptp_ctrl_union *pptpReq) 117 union pptp_ctrl_union *pptpReq)
118 118
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
123 __be16 new_callid; 123 __be16 new_callid;
124 unsigned int cid_off; 124 unsigned int cid_off;
125 125
126 ct_pptp_info = nfct_help_data(ct); 126 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
128 128
129 new_callid = ct_pptp_info->pns_call_id; 129 new_callid = ct_pptp_info->pns_call_id;
@@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); 175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
176 176
177 /* mangle packet */ 177 /* mangle packet */
178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, 178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
179 cid_off + sizeof(struct pptp_pkt_hdr) + 179 cid_off + sizeof(struct pptp_pkt_hdr) +
180 sizeof(struct PptpControlHeader), 180 sizeof(struct PptpControlHeader),
181 sizeof(new_callid), (char *)&new_callid, 181 sizeof(new_callid), (char *)&new_callid,
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
192 struct nf_ct_pptp_master *ct_pptp_info; 192 struct nf_ct_pptp_master *ct_pptp_info;
193 struct nf_nat_pptp *nat_pptp_info; 193 struct nf_nat_pptp *nat_pptp_info;
194 194
195 ct_pptp_info = nfct_help_data(ct); 195 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
197 197
198 /* save original PAC call ID in nat_info */ 198 /* save original PAC call ID in nat_info */
@@ -216,7 +216,6 @@ static int
216pptp_inbound_pkt(struct sk_buff *skb, 216pptp_inbound_pkt(struct sk_buff *skb,
217 struct nf_conn *ct, 217 struct nf_conn *ct,
218 enum ip_conntrack_info ctinfo, 218 enum ip_conntrack_info ctinfo,
219 unsigned int protoff,
220 struct PptpControlHeader *ctlh, 219 struct PptpControlHeader *ctlh,
221 union pptp_ctrl_union *pptpReq) 220 union pptp_ctrl_union *pptpReq)
222{ 221{
@@ -269,7 +268,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
269 pr_debug("altering peer call id from 0x%04x to 0x%04x\n", 268 pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
270 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); 269 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
271 270
272 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, 271 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
273 pcid_off + sizeof(struct pptp_pkt_hdr) + 272 pcid_off + sizeof(struct pptp_pkt_hdr) +
274 sizeof(struct PptpControlHeader), 273 sizeof(struct PptpControlHeader),
275 sizeof(new_pcid), (char *)&new_pcid, 274 sizeof(new_pcid), (char *)&new_pcid,
@@ -283,25 +282,25 @@ static int __init nf_nat_helper_pptp_init(void)
283 nf_nat_need_gre(); 282 nf_nat_need_gre();
284 283
285 BUG_ON(nf_nat_pptp_hook_outbound != NULL); 284 BUG_ON(nf_nat_pptp_hook_outbound != NULL);
286 RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); 285 rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
287 286
288 BUG_ON(nf_nat_pptp_hook_inbound != NULL); 287 BUG_ON(nf_nat_pptp_hook_inbound != NULL);
289 RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); 288 rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
290 289
291 BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); 290 BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
292 RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); 291 rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
293 292
294 BUG_ON(nf_nat_pptp_hook_expectfn != NULL); 293 BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
295 RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected); 294 rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
296 return 0; 295 return 0;
297} 296}
298 297
299static void __exit nf_nat_helper_pptp_fini(void) 298static void __exit nf_nat_helper_pptp_fini(void)
300{ 299{
301 RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); 300 rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL);
302 RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); 301 rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL);
303 RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); 302 rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL);
304 RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL); 303 rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL);
305 synchronize_rcu(); 304 synchronize_rcu();
306} 305}
307 306
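
As in nf_nat_h323.c above, the hook-pointer assignments here go back from RCU_INIT_POINTER() to rcu_assign_pointer(): the latter always issues the publish barrier, while RCU_INIT_POINTER() is the later, cheaper form meant for cases where no ordering is required (initialisation, or storing NULL). The reader side of such an optional hook follows the usual RCU pattern; a minimal sketch with an invented hook name:

    #include <linux/rcupdate.h>
    #include <linux/skbuff.h>

    typedef void (*example_hook_t)(struct sk_buff *skb);
    static example_hook_t __rcu example_hook __read_mostly;

    static void example_caller(struct sk_buff *skb)
    {
            example_hook_t fn;

            rcu_read_lock();
            fn = rcu_dereference(example_hook);
            if (fn)
                    fn(skb);
            rcu_read_unlock();
    }

    /* provider module:
     *   init: rcu_assign_pointer(example_hook, my_impl);
     *   exit: rcu_assign_pointer(example_hook, NULL); synchronize_rcu();
     */
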
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index ea44f02563b..bc8d83a31c7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,7 +28,8 @@
28#include <linux/ip.h> 28#include <linux/ip.h>
29 29
30#include <net/netfilter/nf_nat.h> 30#include <net/netfilter/nf_nat.h>
31#include <net/netfilter/nf_nat_l4proto.h> 31#include <net/netfilter/nf_nat_rule.h>
32#include <net/netfilter/nf_nat_protocol.h>
32#include <linux/netfilter/nf_conntrack_proto_gre.h> 33#include <linux/netfilter/nf_conntrack_proto_gre.h>
33 34
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
@@ -37,8 +38,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
37 38
38/* generate unique tuple ... */ 39/* generate unique tuple ... */
39static void 40static void
40gre_unique_tuple(const struct nf_nat_l3proto *l3proto, 41gre_unique_tuple(struct nf_conntrack_tuple *tuple,
41 struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range, 42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
44 const struct nf_conn *ct) 44 const struct nf_conn *ct)
@@ -52,18 +52,18 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
52 if (!ct->master) 52 if (!ct->master)
53 return; 53 return;
54 54
55 if (maniptype == NF_NAT_MANIP_SRC) 55 if (maniptype == IP_NAT_MANIP_SRC)
56 keyptr = &tuple->src.u.gre.key; 56 keyptr = &tuple->src.u.gre.key;
57 else 57 else
58 keyptr = &tuple->dst.u.gre.key; 58 keyptr = &tuple->dst.u.gre.key;
59 59
60 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 60 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
61 pr_debug("%p: NATing GRE PPTP\n", ct); 61 pr_debug("%p: NATing GRE PPTP\n", ct);
62 min = 1; 62 min = 1;
63 range_size = 0xffff; 63 range_size = 0xffff;
64 } else { 64 } else {
65 min = ntohs(range->min_proto.gre.key); 65 min = ntohs(range->min.gre.key);
66 range_size = ntohs(range->max_proto.gre.key) - min + 1; 66 range_size = ntohs(range->max.gre.key) - min + 1;
67 } 67 }
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
82static bool 82static bool
83gre_manip_pkt(struct sk_buff *skb, 83gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
84 const struct nf_nat_l3proto *l3proto,
85 unsigned int iphdroff, unsigned int hdroff,
86 const struct nf_conntrack_tuple *tuple, 84 const struct nf_conntrack_tuple *tuple,
87 enum nf_nat_manip_type maniptype) 85 enum nf_nat_manip_type maniptype)
88{ 86{
89 const struct gre_hdr *greh; 87 const struct gre_hdr *greh;
90 struct gre_hdr_pptp *pgreh; 88 struct gre_hdr_pptp *pgreh;
89 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
90 unsigned int hdroff = iphdroff + iph->ihl * 4;
91 91
92 /* pgreh includes two optional 32bit fields which are not required 92 /* pgreh includes two optional 32bit fields which are not required
93 * to be there. That's where the magic '8' comes from */ 93 * to be there. That's where the magic '8' comes from */
@@ -99,7 +99,7 @@ gre_manip_pkt(struct sk_buff *skb,
99 99
100 /* we only have destination manip of a packet, since 'source key' 100 /* we only have destination manip of a packet, since 'source key'
101 * is not present in the packet itself */ 101 * is not present in the packet itself */
102 if (maniptype != NF_NAT_MANIP_DST) 102 if (maniptype != IP_NAT_MANIP_DST)
103 return true; 103 return true;
104 switch (greh->version) { 104 switch (greh->version) {
105 case GRE_VERSION_1701: 105 case GRE_VERSION_1701:
@@ -117,24 +117,26 @@ gre_manip_pkt(struct sk_buff *skb,
117 return true; 117 return true;
118} 118}
119 119
120static const struct nf_nat_l4proto gre = { 120static const struct nf_nat_protocol gre = {
121 .l4proto = IPPROTO_GRE, 121 .protonum = IPPROTO_GRE,
122 .me = THIS_MODULE,
122 .manip_pkt = gre_manip_pkt, 123 .manip_pkt = gre_manip_pkt,
123 .in_range = nf_nat_l4proto_in_range, 124 .in_range = nf_nat_proto_in_range,
124 .unique_tuple = gre_unique_tuple, 125 .unique_tuple = gre_unique_tuple,
125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 126#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
126 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 127 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
128 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
127#endif 129#endif
128}; 130};
129 131
130static int __init nf_nat_proto_gre_init(void) 132static int __init nf_nat_proto_gre_init(void)
131{ 133{
132 return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); 134 return nf_nat_protocol_register(&gre);
133} 135}
134 136
135static void __exit nf_nat_proto_gre_fini(void) 137static void __exit nf_nat_proto_gre_fini(void)
136{ 138{
137 nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); 139 nf_nat_protocol_unregister(&gre);
138} 140}
139 141
140module_init(nf_nat_proto_gre_init); 142module_init(nf_nat_proto_gre_init);
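In both versions, gre_unique_tuple() walks candidate GRE keys inside [min, min + range_size), falling back to 1..0xffff when no range was configured, and stops at the first key that is not already in use. The userspace sketch below mirrors that loop under stated assumptions: key_in_use() stands in for nf_nat_used_tuple() and its behaviour here is made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for nf_nat_used_tuple(): pretend keys below 5 are taken */
static bool key_in_use(uint16_t key)
{
	return key < 5;
}

/*
 * Mirror of the gre_unique_tuple() search: start from a hint, wrap
 * around inside [min, min + range_size), give up after one full pass.
 * range_size must be non-zero.
 */
static bool pick_gre_key(uint16_t min, uint32_t range_size,
			 uint16_t hint, uint16_t *out)
{
	uint32_t key = hint;

	for (uint32_t i = 0; i < range_size; i++, key++) {
		uint16_t candidate = min + (key % range_size);

		if (!key_in_use(candidate)) {
			*out = candidate;
			return true;
		}
	}
	return false;	/* every key in the range is taken */
}

int main(void)
{
	uint16_t key;

	/* no range configured: same default span the code above uses */
	if (pick_gre_key(1, 0xffff, 0, &key))
		printf("chose GRE key %u\n", key);
	return 0;
}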
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf..5744c3ec847 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -8,14 +8,14 @@
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/ip.h> 11#include <linux/ip.h>
13#include <linux/icmp.h> 12#include <linux/icmp.h>
14 13
15#include <linux/netfilter.h> 14#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h> 15#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h> 16#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_l4proto.h> 17#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_nat_protocol.h>
19 19
20static bool 20static bool
21icmp_in_range(const struct nf_conntrack_tuple *tuple, 21icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -28,8 +28,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
28} 28}
29 29
30static void 30static void
31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, 31icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
32 struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range, 32 const struct nf_nat_range *range,
34 enum nf_nat_manip_type maniptype, 33 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 34 const struct nf_conn *ct)
@@ -38,14 +37,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
38 unsigned int range_size; 37 unsigned int range_size;
39 unsigned int i; 38 unsigned int i;
40 39
41 range_size = ntohs(range->max_proto.icmp.id) - 40 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 ntohs(range->min_proto.icmp.id) + 1;
43 /* If no range specified... */ 41 /* If no range specified... */
44 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
45 range_size = 0xFFFF; 43 range_size = 0xFFFF;
46 44
47 for (i = 0; ; ++id) { 45 for (i = 0; ; ++id) {
48 tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + 46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
49 (id % range_size)); 47 (id % range_size));
50 if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) 48 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
51 return; 49 return;
@@ -55,12 +53,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
55 53
56static bool 54static bool
57icmp_manip_pkt(struct sk_buff *skb, 55icmp_manip_pkt(struct sk_buff *skb,
58 const struct nf_nat_l3proto *l3proto, 56 unsigned int iphdroff,
59 unsigned int iphdroff, unsigned int hdroff,
60 const struct nf_conntrack_tuple *tuple, 57 const struct nf_conntrack_tuple *tuple,
61 enum nf_nat_manip_type maniptype) 58 enum nf_nat_manip_type maniptype)
62{ 59{
60 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
63 struct icmphdr *hdr; 61 struct icmphdr *hdr;
62 unsigned int hdroff = iphdroff + iph->ihl*4;
64 63
65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 64 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
66 return false; 65 return false;
@@ -72,12 +71,14 @@ icmp_manip_pkt(struct sk_buff *skb,
72 return true; 71 return true;
73} 72}
74 73
75const struct nf_nat_l4proto nf_nat_l4proto_icmp = { 74const struct nf_nat_protocol nf_nat_protocol_icmp = {
76 .l4proto = IPPROTO_ICMP, 75 .protonum = IPPROTO_ICMP,
76 .me = THIS_MODULE,
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 81 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
82 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
82#endif 83#endif
83}; 84};
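The right-hand (older) gre_manip_pkt()/icmp_manip_pkt() recompute the L4 header offset themselves as iphdroff + iph->ihl * 4, while the newer code receives hdroff from its caller. A small standalone sketch of that offset computation on a raw packet buffer follows; the sample bytes are made up.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * The low nibble of the first IP header byte is the header length in
 * 32-bit words, so the transport header starts ihl * 4 bytes after the
 * start of the IP header.
 */
static size_t transport_offset(const uint8_t *pkt, size_t iphdroff)
{
	unsigned int ihl = pkt[iphdroff] & 0x0f;

	return iphdroff + ihl * 4;
}

int main(void)
{
	/* 0x45 = IPv4, ihl of 5 words (20 bytes); rest of packet omitted */
	uint8_t pkt[64] = { 0x45 };

	printf("ICMP/GRE header starts at offset %zu\n",
	       transport_offset(pkt, 0));
	return 0;
}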
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index bac712293fd..076b7c8c4aa 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -400,12 +400,15 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
400 *len = 0; 400 *len = 0;
401 401
402 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); 402 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
403 if (*octets == NULL) 403 if (*octets == NULL) {
404 if (net_ratelimit())
405 pr_notice("OOM in bsalg (%d)\n", __LINE__);
404 return 0; 406 return 0;
407 }
405 408
406 ptr = *octets; 409 ptr = *octets;
407 while (ctx->pointer < eoc) { 410 while (ctx->pointer < eoc) {
408 if (!asn1_octet_decode(ctx, ptr++)) { 411 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
409 kfree(*octets); 412 kfree(*octets);
410 *octets = NULL; 413 *octets = NULL;
411 return 0; 414 return 0;
@@ -448,8 +451,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
448 return 0; 451 return 0;
449 452
450 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); 453 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
451 if (*oid == NULL) 454 if (*oid == NULL) {
455 if (net_ratelimit())
456 pr_notice("OOM in bsalg (%d)\n", __LINE__);
452 return 0; 457 return 0;
458 }
453 459
454 optr = *oid; 460 optr = *oid;
455 461
@@ -722,6 +728,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
722 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); 728 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
723 if (*obj == NULL) { 729 if (*obj == NULL) {
724 kfree(id); 730 kfree(id);
731 if (net_ratelimit())
732 pr_notice("OOM in bsalg (%d)\n", __LINE__);
725 return 0; 733 return 0;
726 } 734 }
727 (*obj)->syntax.l[0] = l; 735 (*obj)->syntax.l[0] = l;
@@ -736,6 +744,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
736 if (*obj == NULL) { 744 if (*obj == NULL) {
737 kfree(p); 745 kfree(p);
738 kfree(id); 746 kfree(id);
747 if (net_ratelimit())
748 pr_notice("OOM in bsalg (%d)\n", __LINE__);
739 return 0; 749 return 0;
740 } 750 }
741 memcpy((*obj)->syntax.c, p, len); 751 memcpy((*obj)->syntax.c, p, len);
@@ -749,6 +759,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
749 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); 759 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
750 if (*obj == NULL) { 760 if (*obj == NULL) {
751 kfree(id); 761 kfree(id);
762 if (net_ratelimit())
763 pr_notice("OOM in bsalg (%d)\n", __LINE__);
752 return 0; 764 return 0;
753 } 765 }
754 if (!asn1_null_decode(ctx, end)) { 766 if (!asn1_null_decode(ctx, end)) {
@@ -759,7 +771,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
759 } 771 }
760 break; 772 break;
761 case SNMP_OBJECTID: 773 case SNMP_OBJECTID:
762 if (!asn1_oid_decode(ctx, end, &lp, &len)) { 774 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
763 kfree(id); 775 kfree(id);
764 return 0; 776 return 0;
765 } 777 }
@@ -768,6 +780,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
768 if (*obj == NULL) { 780 if (*obj == NULL) {
769 kfree(lp); 781 kfree(lp);
770 kfree(id); 782 kfree(id);
783 if (net_ratelimit())
784 pr_notice("OOM in bsalg (%d)\n", __LINE__);
771 return 0; 785 return 0;
772 } 786 }
773 memcpy((*obj)->syntax.ul, lp, len); 787 memcpy((*obj)->syntax.ul, lp, len);
@@ -787,6 +801,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
787 if (*obj == NULL) { 801 if (*obj == NULL) {
788 kfree(p); 802 kfree(p);
789 kfree(id); 803 kfree(id);
804 if (net_ratelimit())
805 pr_notice("OOM in bsalg (%d)\n", __LINE__);
790 return 0; 806 return 0;
791 } 807 }
792 memcpy((*obj)->syntax.uc, p, len); 808 memcpy((*obj)->syntax.uc, p, len);
@@ -803,6 +819,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
803 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); 819 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
804 if (*obj == NULL) { 820 if (*obj == NULL) {
805 kfree(id); 821 kfree(id);
822 if (net_ratelimit())
823 pr_notice("OOM in bsalg (%d)\n", __LINE__);
806 return 0; 824 return 0;
807 } 825 }
808 (*obj)->syntax.ul[0] = ul; 826 (*obj)->syntax.ul[0] = ul;
@@ -1206,7 +1224,8 @@ static int snmp_translate(struct nf_conn *ct,
1206 1224
1207 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), 1225 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1208 paylen, &map, &udph->check)) { 1226 paylen, &map, &udph->check)) {
1209 net_warn_ratelimited("bsalg: parser failed\n"); 1227 if (net_ratelimit())
1228 printk(KERN_WARNING "bsalg: parser failed\n");
1210 return NF_DROP; 1229 return NF_DROP;
1211 } 1230 }
1212 return NF_ACCEPT; 1231 return NF_ACCEPT;
@@ -1240,8 +1259,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
1240 * can mess around with the payload. 1259 * can mess around with the payload.
1241 */ 1260 */
1242 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { 1261 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
1243 net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", 1262 if (net_ratelimit())
1244 &iph->saddr, &iph->daddr); 1263 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
1264 &iph->saddr, &iph->daddr);
1245 return NF_DROP; 1265 return NF_DROP;
1246 } 1266 }
1247 1267
@@ -1290,7 +1310,7 @@ static int __init nf_nat_snmp_basic_init(void)
1290 int ret = 0; 1310 int ret = 0;
1291 1311
1292 BUG_ON(nf_nat_snmp_hook != NULL); 1312 BUG_ON(nf_nat_snmp_hook != NULL);
1293 RCU_INIT_POINTER(nf_nat_snmp_hook, help); 1313 rcu_assign_pointer(nf_nat_snmp_hook, help);
1294 1314
1295 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1315 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1296 if (ret < 0) { 1316 if (ret < 0) {
@@ -1302,7 +1322,7 @@ static int __init nf_nat_snmp_basic_init(void)
1302 1322
1303static void __exit nf_nat_snmp_basic_fini(void) 1323static void __exit nf_nat_snmp_basic_fini(void)
1304{ 1324{
1305 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); 1325 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1306 nf_conntrack_helper_unregister(&snmp_trap_helper); 1326 nf_conntrack_helper_unregister(&snmp_trap_helper);
1307} 1327}
1308 1328
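The older side restores net_ratelimit()-guarded pr_notice()/printk() messages that the newer code dropped in favour of net_warn_ratelimited() and the allocator's own warnings. The idea in both cases is "log at most N messages per interval". Below is a rough userspace sketch of that throttle; the interval and burst values are illustrative, not necessarily the kernel defaults, and the helper is not thread-safe.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* allow at most `burst` messages per `interval` seconds */
static bool log_ratelimit(void)
{
	static const int interval = 5, burst = 10;
	static time_t window_start;
	static int printed;
	time_t now = time(NULL);

	if (now - window_start >= interval) {
		window_start = now;
		printed = 0;
	}
	if (printed >= burst)
		return false;
	printed++;
	return true;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		if (log_ratelimit())
			fprintf(stderr, "OOM in bsalg (%d)\n", __LINE__);
	return 0;
}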
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8f3d05424a3..39b403f854c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -20,6 +20,7 @@
20 * 20 *
21 */ 21 */
22 22
23#include <asm/system.h>
23#include <linux/uaccess.h> 24#include <linux/uaccess.h>
24#include <linux/types.h> 25#include <linux/types.h>
25#include <linux/fcntl.h> 26#include <linux/fcntl.h>
@@ -38,7 +39,6 @@
38#include <net/protocol.h> 39#include <net/protocol.h>
39#include <linux/skbuff.h> 40#include <linux/skbuff.h>
40#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
41#include <linux/export.h>
42#include <net/sock.h> 42#include <net/sock.h>
43#include <net/ping.h> 43#include <net/ping.h>
44#include <net/udp.h> 44#include <net/udp.h>
@@ -51,16 +51,15 @@ static struct ping_table ping_table;
51 51
52static u16 ping_port_rover; 52static u16 ping_port_rover;
53 53
54static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) 54static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
55{ 55{
56 int res = (num + net_hash_mix(net)) & mask; 56 int res = (num + net_hash_mix(net)) & mask;
57
58 pr_debug("hash(%d) = %d\n", num, res); 57 pr_debug("hash(%d) = %d\n", num, res);
59 return res; 58 return res;
60} 59}
61 60
62static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, 61static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
63 struct net *net, unsigned int num) 62 struct net *net, unsigned num)
64{ 63{
65 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; 64 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
66} 65}
@@ -140,14 +139,13 @@ static void ping_v4_unhash(struct sock *sk)
140 write_lock_bh(&ping_table.lock); 139 write_lock_bh(&ping_table.lock);
141 hlist_nulls_del(&sk->sk_nulls_node); 140 hlist_nulls_del(&sk->sk_nulls_node);
142 sock_put(sk); 141 sock_put(sk);
143 isk->inet_num = 0; 142 isk->inet_num = isk->inet_sport = 0;
144 isk->inet_sport = 0;
145 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 143 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
146 write_unlock_bh(&ping_table.lock); 144 write_unlock_bh(&ping_table.lock);
147 } 145 }
148} 146}
149 147
150static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, 148static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
151 u16 ident, int dif) 149 u16 ident, int dif)
152{ 150{
153 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); 151 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
@@ -155,15 +153,15 @@ static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr,
155 struct inet_sock *isk; 153 struct inet_sock *isk;
156 struct hlist_nulls_node *hnode; 154 struct hlist_nulls_node *hnode;
157 155
158 pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", 156 pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
159 (int)ident, &daddr, dif); 157 (int)ident, (unsigned long)daddr, dif);
160 read_lock_bh(&ping_table.lock); 158 read_lock_bh(&ping_table.lock);
161 159
162 ping_portaddr_for_each_entry(sk, hnode, hslot) { 160 ping_portaddr_for_each_entry(sk, hnode, hslot) {
163 isk = inet_sk(sk); 161 isk = inet_sk(sk);
164 162
165 pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk, 163 pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
166 (int)isk->inet_num, &isk->inet_rcv_saddr, 164 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
167 sk->sk_bound_dev_if); 165 sk->sk_bound_dev_if);
168 166
169 pr_debug("iterate\n"); 167 pr_debug("iterate\n");
@@ -185,12 +183,11 @@ exit:
185 return sk; 183 return sk;
186} 184}
187 185
188static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, 186static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
189 kgid_t *high) 187 gid_t *high)
190{ 188{
191 kgid_t *data = net->ipv4.sysctl_ping_group_range; 189 gid_t *data = net->ipv4.sysctl_ping_group_range;
192 unsigned int seq; 190 unsigned seq;
193
194 do { 191 do {
195 seq = read_seqbegin(&sysctl_local_ports.lock); 192 seq = read_seqbegin(&sysctl_local_ports.lock);
196 193
@@ -203,20 +200,21 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
203static int ping_init_sock(struct sock *sk) 200static int ping_init_sock(struct sock *sk)
204{ 201{
205 struct net *net = sock_net(sk); 202 struct net *net = sock_net(sk);
206 kgid_t group = current_egid(); 203 gid_t group = current_egid();
204 gid_t range[2];
207 struct group_info *group_info = get_current_groups(); 205 struct group_info *group_info = get_current_groups();
208 int i, j, count = group_info->ngroups; 206 int i, j, count = group_info->ngroups;
209 kgid_t low, high;
210 207
211 inet_get_ping_group_range_net(net, &low, &high); 208 inet_get_ping_group_range_net(net, range, range+1);
212 if (gid_lte(low, group) && gid_lte(group, high)) 209 if (range[0] <= group && group <= range[1])
213 return 0; 210 return 0;
214 211
215 for (i = 0; i < group_info->nblocks; i++) { 212 for (i = 0; i < group_info->nblocks; i++) {
216 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); 213 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
214
217 for (j = 0; j < cp_count; j++) { 215 for (j = 0; j < cp_count; j++) {
218 kgid_t gid = group_info->blocks[i][j]; 216 group = group_info->blocks[i][j];
219 if (gid_lte(low, gid) && gid_lte(gid, high)) 217 if (range[0] <= group && group <= range[1])
220 return 0; 218 return 0;
221 } 219 }
222 220
@@ -229,7 +227,7 @@ static int ping_init_sock(struct sock *sk)
229static void ping_close(struct sock *sk, long timeout) 227static void ping_close(struct sock *sk, long timeout)
230{ 228{
231 pr_debug("ping_close(sk=%p,sk->num=%u)\n", 229 pr_debug("ping_close(sk=%p,sk->num=%u)\n",
232 inet_sk(sk), inet_sk(sk)->inet_num); 230 inet_sk(sk), inet_sk(sk)->inet_num);
233 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); 231 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
234 232
235 sk_common_release(sk); 233 sk_common_release(sk);
@@ -252,10 +250,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
252 return -EINVAL; 250 return -EINVAL;
253 251
254 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", 252 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
255 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); 253 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
256 254
257 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); 255 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
258 if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) 256 if (addr->sin_addr.s_addr == INADDR_ANY)
259 chk_addr_ret = RTN_LOCAL; 257 chk_addr_ret = RTN_LOCAL;
260 258
261 if ((sysctl_ip_nonlocal_bind == 0 && 259 if ((sysctl_ip_nonlocal_bind == 0 &&
@@ -279,10 +277,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
279 goto out; 277 goto out;
280 } 278 }
281 279
282 pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n", 280 pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
283 (int)isk->inet_num, 281 (int)isk->inet_num,
284 &isk->inet_rcv_saddr, 282 (unsigned long) isk->inet_rcv_saddr,
285 (int)sk->sk_bound_dev_if); 283 (int)sk->sk_bound_dev_if);
286 284
287 err = 0; 285 err = 0;
288 if (isk->inet_rcv_saddr) 286 if (isk->inet_rcv_saddr)
@@ -335,11 +333,12 @@ void ping_err(struct sk_buff *skb, u32 info)
335 return; 333 return;
336 334
337 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, 335 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
338 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); 336 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
339 337
340 sk = ping_v4_lookup(net, iph->daddr, iph->saddr, 338 sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
341 ntohs(icmph->un.echo.id), skb->dev->ifindex); 339 ntohs(icmph->un.echo.id), skb->dev->ifindex);
342 if (sk == NULL) { 340 if (sk == NULL) {
341 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
343 pr_debug("no socket, dropping\n"); 342 pr_debug("no socket, dropping\n");
344 return; /* No socket for error */ 343 return; /* No socket for error */
345 } 344 }
@@ -365,7 +364,6 @@ void ping_err(struct sk_buff *skb, u32 info)
365 break; 364 break;
366 case ICMP_DEST_UNREACH: 365 case ICMP_DEST_UNREACH:
367 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 366 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
368 ipv4_sk_update_pmtu(skb, sk, info);
369 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { 367 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
370 err = EMSGSIZE; 368 err = EMSGSIZE;
371 harderr = 1; 369 harderr = 1;
@@ -381,7 +379,6 @@ void ping_err(struct sk_buff *skb, u32 info)
381 break; 379 break;
382 case ICMP_REDIRECT: 380 case ICMP_REDIRECT:
383 /* See ICMP_SOURCE_QUENCH */ 381 /* See ICMP_SOURCE_QUENCH */
384 ipv4_sk_redirect(skb, sk);
385 err = EREMOTEIO; 382 err = EREMOTEIO;
386 break; 383 break;
387 } 384 }
@@ -410,10 +407,10 @@ out:
410struct pingfakehdr { 407struct pingfakehdr {
411 struct icmphdr icmph; 408 struct icmphdr icmph;
412 struct iovec *iov; 409 struct iovec *iov;
413 __wsum wcheck; 410 u32 wcheck;
414}; 411};
415 412
416static int ping_getfrag(void *from, char *to, 413static int ping_getfrag(void *from, char * to,
417 int offset, int fraglen, int odd, struct sk_buff *skb) 414 int offset, int fraglen, int odd, struct sk_buff *skb)
418{ 415{
419 struct pingfakehdr *pfh = (struct pingfakehdr *)from; 416 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
@@ -462,7 +459,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
462 struct rtable *rt = NULL; 459 struct rtable *rt = NULL;
463 struct ip_options_data opt_copy; 460 struct ip_options_data opt_copy;
464 int free = 0; 461 int free = 0;
465 __be32 saddr, daddr, faddr; 462 u32 saddr, daddr, faddr;
466 u8 tos; 463 u8 tos;
467 int err; 464 int err;
468 465
@@ -558,8 +555,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
558 ipc.oif = inet->mc_index; 555 ipc.oif = inet->mc_index;
559 if (!saddr) 556 if (!saddr)
560 saddr = inet->mc_addr; 557 saddr = inet->mc_addr;
561 } else if (!ipc.oif) 558 }
562 ipc.oif = inet->uc_index;
563 559
564 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 560 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
565 RT_SCOPE_UNIVERSE, sk->sk_protocol, 561 RT_SCOPE_UNIVERSE, sk->sk_protocol,
@@ -633,7 +629,6 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
633 629
634 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); 630 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
635 631
636 err = -EOPNOTSUPP;
637 if (flags & MSG_OOB) 632 if (flags & MSG_OOB)
638 goto out; 633 goto out;
639 634
@@ -681,8 +676,9 @@ out:
681static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 676static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
682{ 677{
683 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", 678 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
684 inet_sk(sk), inet_sk(sk)->inet_num, skb); 679 inet_sk(sk), inet_sk(sk)->inet_num, skb);
685 if (sock_queue_rcv_skb(sk, skb) < 0) { 680 if (sock_queue_rcv_skb(sk, skb) < 0) {
681 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
686 kfree_skb(skb); 682 kfree_skb(skb);
687 pr_debug("ping_queue_rcv_skb -> failed\n"); 683 pr_debug("ping_queue_rcv_skb -> failed\n");
688 return -1; 684 return -1;
@@ -701,13 +697,13 @@ void ping_rcv(struct sk_buff *skb)
701 struct net *net = dev_net(skb->dev); 697 struct net *net = dev_net(skb->dev);
702 struct iphdr *iph = ip_hdr(skb); 698 struct iphdr *iph = ip_hdr(skb);
703 struct icmphdr *icmph = icmp_hdr(skb); 699 struct icmphdr *icmph = icmp_hdr(skb);
704 __be32 saddr = iph->saddr; 700 u32 saddr = iph->saddr;
705 __be32 daddr = iph->daddr; 701 u32 daddr = iph->daddr;
706 702
707 /* We assume the packet has already been checked by icmp_rcv */ 703 /* We assume the packet has already been checked by icmp_rcv */
708 704
709 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", 705 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
710 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); 706 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
711 707
712 /* Push ICMP header back */ 708 /* Push ICMP header back */
713 skb_push(skb, skb->data - (u8 *)icmph); 709 skb_push(skb, skb->data - (u8 *)icmph);
@@ -839,9 +835,7 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
839 bucket, src, srcp, dest, destp, sp->sk_state, 835 bucket, src, srcp, dest, destp, sp->sk_state,
840 sk_wmem_alloc_get(sp), 836 sk_wmem_alloc_get(sp),
841 sk_rmem_alloc_get(sp), 837 sk_rmem_alloc_get(sp),
842 0, 0L, 0, 838 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
843 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
844 0, sock_i_ino(sp),
845 atomic_read(&sp->sk_refcnt), sp, 839 atomic_read(&sp->sk_refcnt), sp,
846 atomic_read(&sp->sk_drops), len); 840 atomic_read(&sp->sk_drops), len);
847} 841}
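ping_init_sock() admits an unprivileged ICMP socket only if the caller's effective gid, or one of its supplementary groups, falls inside the ping_group_range sysctl. The sketch below approximates that check in userspace; it takes the allowed range as arguments instead of reading the sysctl, and the example range in main() is arbitrary.

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_GROUPS 64

/* true if egid or any supplementary gid lies in [low, high] */
static bool group_in_range(gid_t low, gid_t high)
{
	gid_t groups[MAX_GROUPS];
	gid_t egid = getegid();
	int n;

	if (low <= egid && egid <= high)
		return true;

	n = getgroups(MAX_GROUPS, groups);
	for (int i = 0; i < n; i++)
		if (low <= groups[i] && groups[i] <= high)
			return true;
	return false;
}

int main(void)
{
	/* 0..1000 would be a typical "root plus first user" range */
	printf("allowed: %d\n", group_in_range(0, 1000));
	return 0;
}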
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8de53e1ddd5..4bfad5da94f 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -42,7 +42,6 @@
42#include <linux/inetdevice.h> 42#include <linux/inetdevice.h>
43#include <linux/proc_fs.h> 43#include <linux/proc_fs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/export.h>
46#include <net/sock.h> 45#include <net/sock.h>
47#include <net/raw.h> 46#include <net/raw.h>
48 47
@@ -56,17 +55,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
56 55
57 local_bh_disable(); 56 local_bh_disable();
58 orphans = percpu_counter_sum_positive(&tcp_orphan_count); 57 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
59 sockets = proto_sockets_allocated_sum_positive(&tcp_prot); 58 sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
60 local_bh_enable(); 59 local_bh_enable();
61 60
62 socket_seq_show(seq); 61 socket_seq_show(seq);
63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
64 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
65 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
66 proto_memory_allocated(&tcp_prot)); 65 atomic_long_read(&tcp_memory_allocated));
67 seq_printf(seq, "UDP: inuse %d mem %ld\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
68 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
69 proto_memory_allocated(&udp_prot)); 68 atomic_long_read(&udp_memory_allocated));
70 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
71 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
72 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -216,6 +215,7 @@ static const struct snmp_mib snmp4_net_list[] = {
216 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), 215 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
217 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), 216 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
218 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), 217 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
218 SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
219 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), 219 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
220 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), 220 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
221 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), 221 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
@@ -232,6 +232,7 @@ static const struct snmp_mib snmp4_net_list[] = {
232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), 232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), 233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), 234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
235 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
235 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), 236 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
236 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), 237 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
237 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), 238 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -255,18 +256,6 @@ static const struct snmp_mib snmp4_net_list[] = {
255 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), 256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
256 SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), 257 SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
257 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), 258 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
258 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
259 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
260 SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
261 SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
262 SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
266 SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
267 SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
268 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
269 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
270 SNMP_MIB_SENTINEL 259 SNMP_MIB_SENTINEL
271}; 260};
272 261
@@ -298,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq)
298 287
299 count = 0; 288 count = 0;
300 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 289 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
301 val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); 290 val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
302 if (val) { 291 if (val) {
303 type[count] = i; 292 type[count] = i;
304 vals[count++] = val; 293 vals[count++] = val;
@@ -317,7 +306,6 @@ static void icmp_put(struct seq_file *seq)
317{ 306{
318 int i; 307 int i;
319 struct net *net = seq->private; 308 struct net *net = seq->private;
320 atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
321 309
322 seq_puts(seq, "\nIcmp: InMsgs InErrors"); 310 seq_puts(seq, "\nIcmp: InMsgs InErrors");
323 for (i=0; icmpmibmap[i].name != NULL; i++) 311 for (i=0; icmpmibmap[i].name != NULL; i++)
@@ -330,13 +318,15 @@ static void icmp_put(struct seq_file *seq)
330 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); 318 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
331 for (i=0; icmpmibmap[i].name != NULL; i++) 319 for (i=0; icmpmibmap[i].name != NULL; i++)
332 seq_printf(seq, " %lu", 320 seq_printf(seq, " %lu",
333 atomic_long_read(ptr + icmpmibmap[i].index)); 321 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
322 icmpmibmap[i].index));
334 seq_printf(seq, " %lu %lu", 323 seq_printf(seq, " %lu %lu",
335 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 324 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
336 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 325 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
337 for (i=0; icmpmibmap[i].name != NULL; i++) 326 for (i=0; icmpmibmap[i].name != NULL; i++)
338 seq_printf(seq, " %lu", 327 seq_printf(seq, " %lu",
339 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); 328 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
329 icmpmibmap[i].index | 0x100));
340} 330}
341 331
342/* 332/*
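The proc.c hunks above trade a single atomic_long per statistic (new side) for per-CPU counters folded at read time with snmp_fold_field() (old side): increments stay cache-local and uncontended, and only the rare /proc read pays for the summation. A simplified userspace sketch with one counter slot per thread follows; the thread count and names are invented.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

/* one counter slot per "cpu"; increments never contend */
static unsigned long mib[NCPUS];

static void *worker(void *arg)
{
	unsigned long slot = (unsigned long)arg;

	for (int i = 0; i < 100000; i++)
		mib[slot]++;		/* only this thread touches mib[slot] */
	return NULL;
}

/* the read side folds all slots together, like snmp_fold_field() */
static unsigned long fold(void)
{
	unsigned long sum = 0;

	for (int i = 0; i < NCPUS; i++)
		sum += mib[i];
	return sum;
}

int main(void)
{
	pthread_t t[NCPUS];

	for (unsigned long i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	printf("total = %lu\n", fold());
	return 0;
}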
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0f9d09f54bd..9ae5c01cd0b 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,7 +29,6 @@
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,17 +36,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 NULL, prot) ? 0 : -1;
42}
43EXPORT_SYMBOL(inet_add_protocol);
44 40
45int inet_add_offload(const struct net_offload *prot, unsigned char protocol) 41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
46{
47 return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
48 NULL, prot) ? 0 : -1; 42 NULL, prot) ? 0 : -1;
49} 43}
50EXPORT_SYMBOL(inet_add_offload); 44EXPORT_SYMBOL(inet_add_protocol);
51 45
52/* 46/*
53 * Remove a protocol from the hash tables. 47 * Remove a protocol from the hash tables.
@@ -55,9 +49,9 @@ EXPORT_SYMBOL(inet_add_offload);
55 49
56int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
57{ 51{
58 int ret; 52 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
59 53
60 ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], 54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
61 prot, NULL) == prot) ? 0 : -1; 55 prot, NULL) == prot) ? 0 : -1;
62 56
63 synchronize_net(); 57 synchronize_net();
@@ -65,16 +59,3 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
65 return ret; 59 return ret;
66} 60}
67EXPORT_SYMBOL(inet_del_protocol); 61EXPORT_SYMBOL(inet_del_protocol);
68
69int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
70{
71 int ret;
72
73 ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
74 prot, NULL) == prot) ? 0 : -1;
75
76 synchronize_net();
77
78 return ret;
79}
80EXPORT_SYMBOL(inet_del_offload);
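Both versions of inet_add_protocol() register a handler by publishing it into its table slot only if the slot is still NULL, using cmpxchg so two concurrent registrations cannot clobber each other (the older side additionally masks the protocol number into the smaller table). A userspace sketch of that lock-free register/unregister with C11 compare-exchange follows; the handler structure and protocol numbers are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PROTOS 256

struct proto_ops { const char *name; };

static _Atomic(const struct proto_ops *) protos[MAX_PROTOS];

/* 0 on success, -1 if some other handler already owns the slot */
static int add_protocol(const struct proto_ops *ops, unsigned char num)
{
	const struct proto_ops *expected = NULL;

	return atomic_compare_exchange_strong(&protos[num], &expected, ops)
		? 0 : -1;
}

/* 0 on success, -1 if we were not the registered handler */
static int del_protocol(const struct proto_ops *ops, unsigned char num)
{
	const struct proto_ops *expected = ops;
	int ret;

	ret = atomic_compare_exchange_strong(&protos[num], &expected, NULL)
		? 0 : -1;
	/* the kernel also waits (synchronize_net()) before freeing ops */
	return ret;
}

int main(void)
{
	static const struct proto_ops gre = { "gre" };

	printf("first add:  %d\n", add_protocol(&gre, 47));
	printf("second add: %d\n", add_protocol(&gre, 47));	/* -1 */
	printf("del:        %d\n", del_protocol(&gre, 47));
	return 0;
}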
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 73d1e4df4bf..61714bd5292 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -48,7 +48,6 @@
48#include <linux/errno.h> 48#include <linux/errno.h>
49#include <linux/aio.h> 49#include <linux/aio.h>
50#include <linux/kernel.h> 50#include <linux/kernel.h>
51#include <linux/export.h>
52#include <linux/spinlock.h> 51#include <linux/spinlock.h>
53#include <linux/sockios.h> 52#include <linux/sockios.h>
54#include <linux/socket.h> 53#include <linux/socket.h>
@@ -131,20 +130,18 @@ found:
131 * 0 - deliver 130 * 0 - deliver
132 * 1 - block 131 * 1 - block
133 */ 132 */
134static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) 133static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
135{ 134{
136 struct icmphdr _hdr; 135 int type;
137 const struct icmphdr *hdr;
138 136
139 hdr = skb_header_pointer(skb, skb_transport_offset(skb), 137 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
140 sizeof(_hdr), &_hdr);
141 if (!hdr)
142 return 1; 138 return 1;
143 139
144 if (hdr->type < 32) { 140 type = icmp_hdr(skb)->type;
141 if (type < 32) {
145 __u32 data = raw_sk(sk)->filter.data; 142 __u32 data = raw_sk(sk)->filter.data;
146 143
147 return ((1U << hdr->type) & data) != 0; 144 return ((1 << type) & data) != 0;
148 } 145 }
149 146
150 /* Do not block unknown ICMP types */ 147 /* Do not block unknown ICMP types */
@@ -218,11 +215,6 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
218 int err = 0; 215 int err = 0;
219 int harderr = 0; 216 int harderr = 0;
220 217
221 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
222 ipv4_sk_update_pmtu(skb, sk, info);
223 else if (type == ICMP_REDIRECT)
224 ipv4_sk_redirect(skb, sk);
225
226 /* Report error on raw socket, if: 218 /* Report error on raw socket, if:
227 1. User requested ip_recverr. 219 1. User requested ip_recverr.
228 2. Socket is connected (otherwise the error indication 220 2. Socket is connected (otherwise the error indication
@@ -295,12 +287,11 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
295 read_unlock(&raw_v4_hashinfo.lock); 287 read_unlock(&raw_v4_hashinfo.lock);
296} 288}
297 289
298static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) 290static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
299{ 291{
300 /* Charge it to the socket. */ 292 /* Charge it to the socket. */
301 293
302 ipv4_pktinfo_prepare(skb); 294 if (ip_queue_rcv_skb(sk, skb) < 0) {
303 if (sock_queue_rcv_skb(sk, skb) < 0) {
304 kfree_skb(skb); 295 kfree_skb(skb);
305 return NET_RX_DROP; 296 return NET_RX_DROP;
306 } 297 }
@@ -335,7 +326,6 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
335 unsigned int iphlen; 326 unsigned int iphlen;
336 int err; 327 int err;
337 struct rtable *rt = *rtp; 328 struct rtable *rt = *rtp;
338 int hlen, tlen;
339 329
340 if (length > rt->dst.dev->mtu) { 330 if (length > rt->dst.dev->mtu) {
341 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 331 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -345,14 +335,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
345 if (flags&MSG_PROBE) 335 if (flags&MSG_PROBE)
346 goto out; 336 goto out;
347 337
348 hlen = LL_RESERVED_SPACE(rt->dst.dev);
349 tlen = rt->dst.dev->needed_tailroom;
350 skb = sock_alloc_send_skb(sk, 338 skb = sock_alloc_send_skb(sk,
351 length + hlen + tlen + 15, 339 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
352 flags & MSG_DONTWAIT, &err); 340 flags & MSG_DONTWAIT, &err);
353 if (skb == NULL) 341 if (skb == NULL)
354 goto error; 342 goto error;
355 skb_reserve(skb, hlen); 343 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
356 344
357 skb->priority = sk->sk_priority; 345 skb->priority = sk->sk_priority;
358 skb->mark = sk->sk_mark; 346 skb->mark = sk->sk_mark;
@@ -498,8 +486,11 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
498 if (msg->msg_namelen < sizeof(*usin)) 486 if (msg->msg_namelen < sizeof(*usin))
499 goto out; 487 goto out;
500 if (usin->sin_family != AF_INET) { 488 if (usin->sin_family != AF_INET) {
501 pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", 489 static int complained;
502 __func__, current->comm); 490 if (!complained++)
491 printk(KERN_INFO "%s forgot to set AF_INET in "
492 "raw sendmsg. Fix it!\n",
493 current->comm);
503 err = -EAFNOSUPPORT; 494 err = -EAFNOSUPPORT;
504 if (usin->sin_family) 495 if (usin->sin_family)
505 goto out; 496 goto out;
@@ -567,8 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
567 ipc.oif = inet->mc_index; 558 ipc.oif = inet->mc_index;
568 if (!saddr) 559 if (!saddr)
569 saddr = inet->mc_addr; 560 saddr = inet->mc_addr;
570 } else if (!ipc.oif) 561 }
571 ipc.oif = inet->uc_index;
572 562
573 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 563 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
574 RT_SCOPE_UNIVERSE, 564 RT_SCOPE_UNIVERSE,
@@ -994,9 +984,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
994 i, src, srcp, dest, destp, sp->sk_state, 984 i, src, srcp, dest, destp, sp->sk_state,
995 sk_wmem_alloc_get(sp), 985 sk_wmem_alloc_get(sp),
996 sk_rmem_alloc_get(sp), 986 sk_rmem_alloc_get(sp),
997 0, 0L, 0, 987 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
998 from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
999 0, sock_i_ino(sp),
1000 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); 988 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
1001} 989}
1002 990
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 844a9ef60db..b5638545deb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -62,14 +62,14 @@
62 * 2 of the License, or (at your option) any later version. 62 * 2 of the License, or (at your option) any later version.
63 */ 63 */
64 64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h> 65#include <linux/module.h>
68#include <asm/uaccess.h> 66#include <asm/uaccess.h>
67#include <asm/system.h>
69#include <linux/bitops.h> 68#include <linux/bitops.h>
70#include <linux/types.h> 69#include <linux/types.h>
71#include <linux/kernel.h> 70#include <linux/kernel.h>
72#include <linux/mm.h> 71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h> 73#include <linux/string.h>
74#include <linux/socket.h> 74#include <linux/socket.h>
75#include <linux/sockios.h> 75#include <linux/sockios.h>
@@ -79,6 +79,7 @@
79#include <linux/netdevice.h> 79#include <linux/netdevice.h>
80#include <linux/proc_fs.h> 80#include <linux/proc_fs.h>
81#include <linux/init.h> 81#include <linux/init.h>
82#include <linux/workqueue.h>
82#include <linux/skbuff.h> 83#include <linux/skbuff.h>
83#include <linux/inetdevice.h> 84#include <linux/inetdevice.h>
84#include <linux/igmp.h> 85#include <linux/igmp.h>
@@ -86,9 +87,11 @@
86#include <linux/mroute.h> 87#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h> 88#include <linux/netfilter_ipv4.h>
88#include <linux/random.h> 89#include <linux/random.h>
90#include <linux/jhash.h>
89#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
90#include <linux/times.h> 92#include <linux/times.h>
91#include <linux/slab.h> 93#include <linux/slab.h>
94#include <linux/prefetch.h>
92#include <net/dst.h> 95#include <net/dst.h>
93#include <net/net_namespace.h> 96#include <net/net_namespace.h>
94#include <net/protocol.h> 97#include <net/protocol.h>
@@ -105,8 +108,8 @@
105#include <net/rtnetlink.h> 108#include <net/rtnetlink.h>
106#ifdef CONFIG_SYSCTL 109#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h> 110#include <linux/sysctl.h>
108#include <linux/kmemleak.h>
109#endif 111#endif
112#include <net/atmclip.h>
110#include <net/secure_seq.h> 113#include <net/secure_seq.h>
111 114
112#define RT_FL_TOS(oldflp4) \ 115#define RT_FL_TOS(oldflp4) \
@@ -118,7 +121,7 @@
118 121
119static int ip_rt_max_size; 122static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; 123static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ; 124static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 125static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9; 126static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50; 127static int ip_rt_redirect_load __read_mostly = HZ / 50;
@@ -129,6 +132,11 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 132static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 133static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 134static int ip_rt_min_advmss __read_mostly = 256;
135static int rt_chain_length_max __read_mostly = 20;
136static int redirect_genid;
137
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
132 140
133/* 141/*
134 * Interface to generic destination cache. 142 * Interface to generic destination cache.
@@ -136,14 +144,12 @@ static int ip_rt_min_advmss __read_mostly = 256;
136 144
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst); 147static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
148static void ipv4_dst_destroy(struct dst_entry *dst);
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb); 150static void ipv4_link_failure(struct sk_buff *skb);
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
143 struct sk_buff *skb, u32 mtu); 152static int rt_garbage_collect(struct dst_ops *ops);
144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147 153
148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 int how) 155 int how)
@@ -152,27 +158,54 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 158
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{ 160{
155 WARN_ON(1); 161 struct rtable *rt = (struct rtable *) dst;
156 return NULL; 162 struct inet_peer *peer;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
157} 192}
158 193
159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
160 struct sk_buff *skb,
161 const void *daddr);
162 195
163static struct dst_ops ipv4_dst_ops = { 196static struct dst_ops ipv4_dst_ops = {
164 .family = AF_INET, 197 .family = AF_INET,
165 .protocol = cpu_to_be16(ETH_P_IP), 198 .protocol = cpu_to_be16(ETH_P_IP),
199 .gc = rt_garbage_collect,
166 .check = ipv4_dst_check, 200 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss, 201 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu, 202 .default_mtu = ipv4_default_mtu,
169 .cow_metrics = ipv4_cow_metrics, 203 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy, 204 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown, 205 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice, 206 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure, 207 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu, 208 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out, 209 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup, 210 .neigh_lookup = ipv4_neigh_lookup,
178}; 211};
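The restored ipv4_cow_metrics() above copies the read-only metrics into per-peer storage and then swings dst->_metrics over with cmpxchg; if another CPU won the race, it simply adopts the winner's pointer. Below is a simplified userspace sketch of that copy-then-compare-exchange pattern; the array layout and names are invented, and the winning copy is deliberately leaked at exit for brevity.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NMETRICS 16

static const unsigned int default_metrics[NMETRICS];	/* shared, read-only */

static _Atomic(const unsigned int *) metrics = default_metrics;

/* return a writable metrics array, installing a private copy on demand */
static unsigned int *cow_metrics(void)
{
	const unsigned int *old = atomic_load(&metrics);
	unsigned int *copy;

	if (old != default_metrics)
		return (unsigned int *)old;	/* already copied earlier */

	copy = malloc(sizeof(default_metrics));
	if (!copy)
		return NULL;
	memcpy(copy, old, sizeof(default_metrics));

	if (atomic_compare_exchange_strong(&metrics, &old, copy))
		return copy;

	/* another caller won the race: drop our copy, use theirs */
	free(copy);
	return (unsigned int *)old;
}

int main(void)
{
	unsigned int *m = cow_metrics();

	if (m) {
		m[0] = 1500;	/* e.g. a per-destination MTU override */
		printf("metric[0] = %u\n", atomic_load(&metrics)[0]);
	}
	return 0;
}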
@@ -197,27 +230,186 @@ const __u8 ip_tos2prio[16] = {
197 TC_PRIO_INTERACTIVE_BULK, 230 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK) 231 ECN_OR_COST(INTERACTIVE_BULK)
199}; 232};
200EXPORT_SYMBOL(ip_tos2prio); 233
234
235/*
236 * Route cache.
237 */
238
239/* The locking scheme is rather straight forward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249struct rt_hash_bucket {
250 struct rtable __rcu *chain;
251};
252
253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
255/*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
257 * The size of this table is a power of two and depends on the number of CPUS.
258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 */
260#ifdef CONFIG_LOCKDEP
261# define RT_HASH_LOCK_SZ 256
262#else
263# if NR_CPUS >= 32
264# define RT_HASH_LOCK_SZ 4096
265# elif NR_CPUS >= 16
266# define RT_HASH_LOCK_SZ 2048
267# elif NR_CPUS >= 8
268# define RT_HASH_LOCK_SZ 1024
269# elif NR_CPUS >= 4
270# define RT_HASH_LOCK_SZ 512
271# else
272# define RT_HASH_LOCK_SZ 256
273# endif
274#endif
275
276static spinlock_t *rt_hash_locks;
277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279static __init void rt_hash_lock_init(void)
280{
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290}
291#else
292# define rt_hash_lock_addr(slot) NULL
293
294static inline void rt_hash_lock_init(void)
295{
296}
297#endif
298
299static struct rt_hash_bucket *rt_hash_table __read_mostly;
300static unsigned rt_hash_mask __read_mostly;
301static unsigned int rt_hash_log __read_mostly;
201 302
202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 305
306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 int genid)
308{
309 return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 idx, genid)
311 & rt_hash_mask;
312}
313
314static inline int rt_genid(struct net *net)
315{
316 return atomic_read(&net->ipv4.rt_genid);
317}
318
205#ifdef CONFIG_PROC_FS 319#ifdef CONFIG_PROC_FS
320struct rt_cache_iter_state {
321 struct seq_net_private p;
322 int bucket;
323 int genid;
324};
325
326static struct rtable *rt_cache_get_first(struct seq_file *seq)
327{
328 struct rt_cache_iter_state *st = seq->private;
329 struct rtable *r = NULL;
330
331 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
333 continue;
334 rcu_read_lock_bh();
335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 while (r) {
337 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 r->rt_genid == st->genid)
339 return r;
340 r = rcu_dereference_bh(r->dst.rt_next);
341 }
342 rcu_read_unlock_bh();
343 }
344 return r;
345}
346
347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 struct rtable *r)
349{
350 struct rt_cache_iter_state *st = seq->private;
351
352 r = rcu_dereference_bh(r->dst.rt_next);
353 while (!r) {
354 rcu_read_unlock_bh();
355 do {
356 if (--st->bucket < 0)
357 return NULL;
358 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
359 rcu_read_lock_bh();
360 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 }
362 return r;
363}
364
365static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 struct rtable *r)
367{
368 struct rt_cache_iter_state *st = seq->private;
369 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 continue;
372 if (r->rt_genid == st->genid)
373 break;
374 }
375 return r;
376}
377
378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379{
380 struct rtable *r = rt_cache_get_first(seq);
381
382 if (r)
383 while (pos && (r = rt_cache_get_next(seq, r)))
384 --pos;
385 return pos ? NULL : r;
386}
387
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{ 389{
390 struct rt_cache_iter_state *st = seq->private;
208 if (*pos) 391 if (*pos)
209 return NULL; 392 return rt_cache_get_idx(seq, *pos - 1);
393 st->genid = rt_genid(seq_file_net(seq));
210 return SEQ_START_TOKEN; 394 return SEQ_START_TOKEN;
211} 395}
212 396
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{ 398{
399 struct rtable *r;
400
401 if (v == SEQ_START_TOKEN)
402 r = rt_cache_get_first(seq);
403 else
404 r = rt_cache_get_next(seq, v);
215 ++*pos; 405 ++*pos;
216 return NULL; 406 return r;
217} 407}
218 408
219static void rt_cache_seq_stop(struct seq_file *seq, void *v) 409static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220{ 410{
411 if (v && v != SEQ_START_TOKEN)
412 rcu_read_unlock_bh();
221} 413}
222 414
223static int rt_cache_seq_show(struct seq_file *seq, void *v) 415static int rt_cache_seq_show(struct seq_file *seq, void *v)
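The restored route cache in the hunk above hashes each flow into a bucket (jhash_3words over daddr, saddr and ifindex, masked by rt_hash_mask) and, instead of one spinlock per bucket, guards buckets with a small power-of-two pool of locks indexed by slot & (RT_HASH_LOCK_SZ - 1). A compact pthread sketch of that lock striping follows; the table sizes and helper names are illustrative.

#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 1024
#define NLOCKS     64		/* power of two, far fewer than buckets */

static int buckets[NBUCKETS];
static pthread_mutex_t locks[NLOCKS];

/* same idea as rt_hash_lock_addr(): many buckets share one lock */
static pthread_mutex_t *bucket_lock(unsigned int slot)
{
	return &locks[slot & (NLOCKS - 1)];
}

static void bucket_inc(unsigned int slot)
{
	pthread_mutex_t *l = bucket_lock(slot);

	pthread_mutex_lock(l);
	buckets[slot]++;
	pthread_mutex_unlock(l);
}

int main(void)
{
	for (int i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);

	bucket_inc(3);
	bucket_inc(3 + NLOCKS);	/* different bucket, same lock */
	printf("bucket 3 = %d, bucket %d = %d\n",
	       buckets[3], 3 + NLOCKS, buckets[3 + NLOCKS]);
	return 0;
}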
@@ -227,6 +419,34 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 419 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 420 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst"); 421 "HHUptod\tSpecDst");
422 else {
423 struct rtable *r = v;
424 struct neighbour *n;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW),
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 dst_metric(&r->dst, RTAX_RTTVAR)),
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447
448 seq_printf(seq, "%*s\n", 127 - len, "");
449 }
230 return 0; 450 return 0;
231} 451}
232 452
@@ -239,7 +459,8 @@ static const struct seq_operations rt_cache_seq_ops = {
239 459
240static int rt_cache_seq_open(struct inode *inode, struct file *file) 460static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{ 461{
242 return seq_open(file, &rt_cache_seq_ops); 462 return seq_open_net(inode, file, &rt_cache_seq_ops,
463 sizeof(struct rt_cache_iter_state));
243} 464}
244 465
245static const struct file_operations rt_cache_seq_fops = { 466static const struct file_operations rt_cache_seq_fops = {
@@ -247,7 +468,7 @@ static const struct file_operations rt_cache_seq_fops = {
247 .open = rt_cache_seq_open, 468 .open = rt_cache_seq_open,
248 .read = seq_read, 469 .read = seq_read,
249 .llseek = seq_lseek, 470 .llseek = seq_lseek,
250 .release = seq_release, 471 .release = seq_release_net,
251}; 472};
252 473
253 474
@@ -437,252 +658,791 @@ static inline int ip_rt_proc_init(void)
437} 658}
438#endif /* CONFIG_PROC_FS */ 659#endif /* CONFIG_PROC_FS */
439 660
440static inline bool rt_is_expired(const struct rtable *rth) 661static inline void rt_free(struct rtable *rt)
441{ 662{
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 663 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
443} 664}
444 665
445void rt_cache_flush(struct net *net) 666static inline void rt_drop(struct rtable *rt)
446{ 667{
447 rt_genid_bump(net); 668 ip_rt_put(rt);
669 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
448} 670}
449 671
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 672static inline int rt_fast_clean(struct rtable *rth)
451 struct sk_buff *skb,
452 const void *daddr)
453{ 673{
454 struct net_device *dev = dst->dev; 674 /* Kill broadcast/multicast entries very aggresively, if they
455 const __be32 *pkey = daddr; 675 collide in hash table with more useful entries */
456 const struct rtable *rt; 676 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
457 struct neighbour *n; 677 rt_is_input_route(rth) && rth->dst.rt_next;
678}
458 679
459 rt = (const struct rtable *) dst; 680static inline int rt_valuable(struct rtable *rth)
460 if (rt->rt_gateway) 681{
461 pkey = (const __be32 *) &rt->rt_gateway; 682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
462 else if (skb) 683 (rth->peer && rth->peer->pmtu_expires);
463 pkey = &ip_hdr(skb)->daddr; 684}
464 685
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); 686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
466 if (n) 687{
467 return n; 688 unsigned long age;
468 return neigh_create(&arp_tbl, pkey, dev); 689 int ret = 0;
690
691 if (atomic_read(&rth->dst.__refcnt))
692 goto out;
693
694 age = jiffies - rth->dst.lastuse;
695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699out: return ret;
469} 700}
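rt_may_expire() above grants a short timeout to entries that are cheap to rebuild (rt_fast_clean) and a long one to entries worth keeping (rt_valuable); anything referenced never expires. A minimal userspace sketch of the same decision, with invented field names standing in for the rtable members and tmo_fast/tmo_slow playing the roles of tmo1/tmo2:

#include <stdbool.h>
#include <stdio.h>

struct cache_entry {
	long refcnt;         /* like dst.__refcnt */
	unsigned long age;   /* jiffies - lastuse, precomputed for the sketch */
	bool cheap;          /* like rt_fast_clean(): colliding broadcast/multicast input routes */
	bool valuable;       /* like rt_valuable(): redirected/notify or learned PMTU */
};

/* Return true when the entry may be reclaimed, mirroring rt_may_expire(). */
static bool may_expire(const struct cache_entry *e,
		       unsigned long tmo_fast, unsigned long tmo_slow)
{
	if (e->refcnt)                         /* still referenced: never expire */
		return false;
	if (e->age <= tmo_fast && !e->cheap)   /* young and not cheap to rebuild */
		return false;
	if (e->age <= tmo_slow && e->valuable) /* valuable entries get the long timeout */
		return false;
	return true;
}

int main(void)
{
	struct cache_entry e = { .refcnt = 0, .age = 30, .cheap = true, .valuable = false };

	/* A cheap broadcast-style entry is reclaimable even though it is young. */
	printf("expire: %d\n", may_expire(&e, 60, 300));
	return 0;
}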
470 701
471/* 702/* Bits of score are:
472 * Peer allocation may fail only in serious out-of-memory conditions. However 703 * 31: very valuable
473 * we still can generate some output. 704 * 30: not quite useless
474 * Random ID selection looks a bit dangerous because we have no chances to 705 * 29..0: usage counter
475 * select ID being unique in a reasonable period of time.
476 * But broken packet identifier may be better than no packet at all.
477 */ 706 */
478static void ip_select_fb_ident(struct iphdr *iph) 707static inline u32 rt_score(struct rtable *rt)
479{ 708{
480 static DEFINE_SPINLOCK(ip_fb_id_lock); 709 u32 score = jiffies - rt->dst.lastuse;
481 static u32 ip_fallback_id;
482 u32 salt;
483 710
484 spin_lock_bh(&ip_fb_id_lock); 711 score = ~score & ~(3<<30);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); 712
486 iph->id = htons(salt & 0xFFFF); 713 if (rt_valuable(rt))
487 ip_fallback_id = salt; 714 score |= (1<<31);
488 spin_unlock_bh(&ip_fb_id_lock); 715
716 if (rt_is_output_route(rt) ||
717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
489} 721}
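rt_score() packs an inverted age into the low 30 bits and reserves the top two bits for "very valuable" and "not quite useless", so a plain unsigned comparison prefers keeping important entries; rt_intern_hash() later evicts the candidate with the lowest score. A standalone sketch of that packing (parameter names are invented for illustration):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Pack an eviction score the way rt_score() does: bit 31 = very valuable,
 * bit 30 = not quite useless, bits 29..0 = inverted age (older => smaller). */
static uint32_t score(uint32_t age, bool valuable, bool output_or_unicast)
{
	uint32_t s = ~age & ~(3u << 30);   /* keep only the low 30 bits of the inverted age */

	if (valuable)
		s |= 1u << 31;
	if (output_or_unicast)
		s |= 1u << 30;
	return s;
}

int main(void)
{
	/* An old but valuable entry still outranks a fresh, unimportant one. */
	printf("%u > %u\n", score(10000, true, false), score(10, false, false));
	return 0;
}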
490 722
491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 723static inline bool rt_caching(const struct net *net)
492{ 724{
493 struct net *net = dev_net(dst->dev); 725 return net->ipv4.current_rt_cache_rebuild_count <=
494 struct inet_peer *peer; 726 net->ipv4.sysctl_rt_cache_rebuild_count;
727}
495 728
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 729static inline bool compare_hash_inputs(const struct rtable *rt1,
497 if (peer) { 730 const struct rtable *rt2)
498 iph->id = htons(inet_getid(peer, more)); 731{
499 inet_putpeer(peer); 732 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
500 return; 733 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
501 } 734 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735}
502 736
503 ip_select_fb_ident(iph); 737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738{
739 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 (rt1->rt_mark ^ rt2->rt_mark) |
742 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
504} 745}
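compare_hash_inputs() and compare_keys() above OR together the XOR of each field pair and test the accumulated bits against zero, so the whole key comparison compiles to straight-line code with a single branch. A small userspace sketch of the same idiom (the struct is invented for illustration):

#include <stdint.h>
#include <stdio.h>

struct flow_key {
	uint32_t dst, src, mark, tos, iif, oif;
};

/* Branchless equality test in the style of compare_keys(): XOR each pair of
 * fields, OR the differences together, and compare the result to zero. */
static int keys_equal(const struct flow_key *a, const struct flow_key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->mark ^ b->mark) |
		(a->tos ^ b->tos) |
		(a->iif ^ b->iif) |
		(a->oif ^ b->oif)) == 0;
}

int main(void)
{
	struct flow_key a = { 0x0a000001, 0x0a000002, 0, 0, 2, 0 };
	struct flow_key b = a;

	printf("equal: %d\n", keys_equal(&a, &b));  /* 1 */
	b.tos = 0x10;
	printf("equal: %d\n", keys_equal(&a, &b));  /* 0 */
	return 0;
}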
505EXPORT_SYMBOL(__ip_select_ident);
506 746
507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, 747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
508 const struct iphdr *iph, 748{
509 int oif, u8 tos, 749 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
510 u8 prot, u32 mark, int flow_flags) 750}
751
752static inline int rt_is_expired(struct rtable *rth)
753{
754 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755}
756
757/*
758 * Perform a full scan of hash table and free all entries.
759 * Can be called by a softirq or a process.
760 * In the latter case, we want to be rescheduled if necessary
761 */
762static void rt_do_flush(struct net *net, int process_context)
763{
764 unsigned int i;
765 struct rtable *rth, *next;
766
767 for (i = 0; i <= rt_hash_mask; i++) {
768 struct rtable __rcu **pprev;
769 struct rtable *list;
770
771 if (process_context && need_resched())
772 cond_resched();
773 rth = rcu_dereference_raw(rt_hash_table[i].chain);
774 if (!rth)
775 continue;
776
777 spin_lock_bh(rt_hash_lock_addr(i));
778
779 list = NULL;
780 pprev = &rt_hash_table[i].chain;
781 rth = rcu_dereference_protected(*pprev,
782 lockdep_is_held(rt_hash_lock_addr(i)));
783
784 while (rth) {
785 next = rcu_dereference_protected(rth->dst.rt_next,
786 lockdep_is_held(rt_hash_lock_addr(i)));
787
788 if (!net ||
789 net_eq(dev_net(rth->dst.dev), net)) {
790 rcu_assign_pointer(*pprev, next);
791 rcu_assign_pointer(rth->dst.rt_next, list);
792 list = rth;
793 } else {
794 pprev = &rth->dst.rt_next;
795 }
796 rth = next;
797 }
798
799 spin_unlock_bh(rt_hash_lock_addr(i));
800
801 for (; list; list = next) {
802 next = rcu_dereference_protected(list->dst.rt_next, 1);
803 rt_free(list);
804 }
805 }
806}
807
808/*
809 * While freeing expired entries, we compute average chain length
810 * and standard deviation, using fixed-point arithmetic.
811 * This gives an estimate of rt_chain_length_max
812 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
813 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814 */
815
816#define FRACT_BITS 3
817#define ONE (1UL << FRACT_BITS)
818
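rt_check_expire() below accumulates per-bucket chain lengths, then derives the cap as max(elasticity, AVG + 4*SD) in FRACT_BITS fixed point. A standalone userspace sketch of that arithmetic, with a naive integer square root standing in for the kernel's int_sqrt() and made-up sample values:

#include <stdio.h>

#define FRACT_BITS 3

static unsigned long isqrt(unsigned long x)       /* stand-in for int_sqrt() */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Chain lengths sampled from a few hash buckets, already scaled by
	 * ONE (1 << FRACT_BITS) the way has_noalias() returns them:
	 * 1.0, 2.0, 1.0, 3.0, 1.0 entries per chain. */
	unsigned long lengths[] = { 8, 16, 8, 24, 8 };
	unsigned long samples = 5, sum = 0, sum2 = 0, elasticity = 8;
	unsigned long avg, sd, chain_length_max;

	for (unsigned long i = 0; i < samples; i++) {
		sum += lengths[i];
		sum2 += lengths[i] * lengths[i];
	}
	avg = sum / samples;
	sd = isqrt(sum2 / samples - avg * avg);

	/* max(elasticity, AVG + 4*SD), shifted back out of fixed point */
	chain_length_max = (avg + 4 * sd) >> FRACT_BITS;
	if (chain_length_max < elasticity)
		chain_length_max = elasticity;

	printf("avg(x8)=%lu sd(x8)=%lu chain_length_max=%lu\n",
	       avg, sd, chain_length_max);
	return 0;
}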
819/*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826static int has_noalias(const struct rtable *head, const struct rtable *rth)
511{ 827{
512 if (sk) { 828 const struct rtable *aux = head;
513 const struct inet_sock *inet = inet_sk(sk);
514 829
515 oif = sk->sk_bound_dev_if; 830 while (aux != rth) {
516 mark = sk->sk_mark; 831 if (compare_hash_inputs(aux, rth))
517 tos = RT_CONN_FLAGS(sk); 832 return 0;
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; 833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
519 } 834 }
520 flowi4_init_output(fl4, oif, mark, tos, 835 return ONE;
521 RT_SCOPE_UNIVERSE, prot,
522 flow_flags,
523 iph->daddr, iph->saddr, 0, 0);
524} 836}
525 837
526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, 838static void rt_check_expire(void)
527 const struct sock *sk)
528{ 839{
529 const struct iphdr *iph = ip_hdr(skb); 840 static unsigned int rover;
530 int oif = skb->dev->ifindex; 841 unsigned int i = rover, goal;
531 u8 tos = RT_TOS(iph->tos); 842 struct rtable *rth;
532 u8 prot = iph->protocol; 843 struct rtable __rcu **rthp;
533 u32 mark = skb->mark; 844 unsigned long samples = 0;
845 unsigned long sum = 0, sum2 = 0;
846 unsigned long delta;
847 u64 mult;
848
849 delta = jiffies - expires_ljiffies;
850 expires_ljiffies = jiffies;
851 mult = ((u64)delta) << rt_hash_log;
852 if (ip_rt_gc_timeout > 1)
853 do_div(mult, ip_rt_gc_timeout);
854 goal = (unsigned int)mult;
855 if (goal > rt_hash_mask)
856 goal = rt_hash_mask + 1;
857 for (; goal > 0; goal--) {
858 unsigned long tmo = ip_rt_gc_timeout;
859 unsigned long length;
860
861 i = (i + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[i].chain;
863
864 if (need_resched())
865 cond_resched();
866
867 samples++;
868
869 if (rcu_dereference_raw(*rthp) == NULL)
870 continue;
871 length = 0;
872 spin_lock_bh(rt_hash_lock_addr(i));
873 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) {
877 *rthp = rth->dst.rt_next;
878 rt_free(rth);
879 continue;
880 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once
890 * so that entries for different QOS
891 * levels, and other non-hash input
892 * attributes don't unfairly skew
893 * the length computation
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900
901 /* Cleanup aged off entries. */
902 *rthp = rth->dst.rt_next;
903 rt_free(rth);
904 }
905 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length;
907 sum2 += length*length;
908 }
909 if (samples) {
910 unsigned long avg = sum / samples;
911 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 rt_chain_length_max = max_t(unsigned long,
913 ip_rt_gc_elasticity,
914 (avg + 4*sd) >> FRACT_BITS);
915 }
916 rover = i;
917}
534 918
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); 919/*
920 * rt_worker_func() is run in process context.
921 * we call rt_check_expire() to scan part of the hash table
922 */
923static void rt_worker_func(struct work_struct *work)
924{
925 rt_check_expire();
926 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
536} 927}
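rt_worker_func() simply re-queues itself, so the partial hash scan keeps running every ip_rt_gc_interval without a dedicated thread. A minimal, hypothetical module showing the same self-rearming delayed-work pattern (names and the 10-second period are illustrative, not taken from route.c):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void scan_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(scan_work, scan_func);

static void scan_func(struct work_struct *work)
{
	pr_info("periodic scan ran at jiffies=%lu\n", jiffies);
	/* Re-arm ourselves, exactly like rt_worker_func() does. */
	schedule_delayed_work(&scan_work, 10 * HZ);
}

static int __init scan_init(void)
{
	schedule_delayed_work(&scan_work, 10 * HZ);
	return 0;
}

static void __exit scan_exit(void)
{
	cancel_delayed_work_sync(&scan_work);
}

module_init(scan_init);
module_exit(scan_exit);
MODULE_LICENSE("GPL");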
537 928
538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) 929/*
930 * Perturbation of rt_genid by a small quantity [1..256]
931 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
932 * many times (2^24) without giving recent rt_genid.
933 * Jenkins hash is strong enough that little changes of rt_genid are OK.
934 */
935static void rt_cache_invalidate(struct net *net)
539{ 936{
540 const struct inet_sock *inet = inet_sk(sk); 937 unsigned char shuffle;
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543 938
544 rcu_read_lock(); 939 get_random_bytes(&shuffle, sizeof(shuffle));
545 inet_opt = rcu_dereference(inet->inet_opt); 940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
546 if (inet_opt && inet_opt->opt.srr) 941 redirect_genid++;
547 daddr = inet_opt->opt.faddr; 942}
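Rather than walking the whole table, the flush path bumps the per-namespace generation number by a random 1..256 and lets rt_is_expired() treat every entry minted under an older genid as dead. A userspace sketch of that lazy-invalidation idea (structure and names invented for illustration):

#include <stdio.h>
#include <stdlib.h>

static unsigned int cache_genid;     /* like net->ipv4.rt_genid */

struct entry {
	unsigned int genid;          /* generation the entry was created under */
	int value;
};

static struct entry make_entry(int value)
{
	struct entry e = { .genid = cache_genid, .value = value };
	return e;
}

static int entry_is_expired(const struct entry *e)   /* like rt_is_expired() */
{
	return e->genid != cache_genid;
}

static void cache_invalidate(void)                   /* like rt_cache_invalidate() */
{
	/* Bump by a random 1..256 so stale entries never match again soon. */
	cache_genid += (unsigned int)(rand() % 256) + 1;
}

int main(void)
{
	struct entry e = make_entry(42);

	printf("expired before flush: %d\n", entry_is_expired(&e));  /* 0 */
	cache_invalidate();
	printf("expired after flush:  %d\n", entry_is_expired(&e));  /* 1 */
	return 0;
}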
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 943
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 944/*
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 945 * delay < 0 : invalidate cache (fast : entries will be deleted later)
551 inet_sk_flowi_flags(sk), 946 * delay >= 0 : invalidate & flush cache (can be long)
552 daddr, inet->inet_saddr, 0, 0); 947 */
553 rcu_read_unlock(); 948void rt_cache_flush(struct net *net, int delay)
949{
950 rt_cache_invalidate(net);
951 if (delay >= 0)
952 rt_do_flush(net, !in_softirq());
554} 953}
555 954
556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, 955/* Flush previous cache invalidated entries from the cache */
557 const struct sk_buff *skb) 956void rt_cache_flush_batch(struct net *net)
558{ 957{
559 if (skb) 958 rt_do_flush(net, !in_softirq());
560 build_skb_flow_key(fl4, skb, sk);
561 else
562 build_sk_flow_key(fl4, sk);
563} 959}
564 960
565static inline void rt_free(struct rtable *rt) 961static void rt_emergency_hash_rebuild(struct net *net)
566{ 962{
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free); 963 if (net_ratelimit())
964 printk(KERN_WARNING "Route hash chain too long!\n");
965 rt_cache_invalidate(net);
568} 966}
569 967
570static DEFINE_SPINLOCK(fnhe_lock); 968/*
969 Short description of GC goals.
970
971 We want to build an algorithm which keeps the routing cache
972 at some equilibrium point, where the number of aged-off entries
973 stays approximately equal to the number of newly generated ones.
974
975 The current expiration strength is the variable "expire".
976 We try to adjust it dynamically, so that when the network
977 is idle, expire is large enough to keep plenty of warm entries,
978 and when load increases it shrinks to limit the cache size.
979 */
571 980
572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) 981static int rt_garbage_collect(struct dst_ops *ops)
573{ 982{
574 struct fib_nh_exception *fnhe, *oldest; 983 static unsigned long expire = RT_GC_TIMEOUT;
575 struct rtable *orig; 984 static unsigned long last_gc;
985 static int rover;
986 static int equilibrium;
987 struct rtable *rth;
988 struct rtable __rcu **rthp;
989 unsigned long now = jiffies;
990 int goal;
991 int entries = dst_entries_get_fast(&ipv4_dst_ops);
992
993 /*
994 * Garbage collection is pretty expensive,
995 * do not run it too frequently.
996 */
997
998 RT_CACHE_STAT_INC(gc_total);
999
1000 if (now - last_gc < ip_rt_gc_min_interval &&
1001 entries < ip_rt_max_size) {
1002 RT_CACHE_STAT_INC(gc_ignored);
1003 goto out;
1004 }
1005
1006 entries = dst_entries_get_slow(&ipv4_dst_ops);
1007 /* Calculate number of entries, which we want to expire now. */
1008 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009 if (goal <= 0) {
1010 if (equilibrium < ipv4_dst_ops.gc_thresh)
1011 equilibrium = ipv4_dst_ops.gc_thresh;
1012 goal = entries - equilibrium;
1013 if (goal > 0) {
1014 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015 goal = entries - equilibrium;
1016 }
1017 } else {
1018 /* We are in a dangerous area. Try to reduce the cache really
1019 * aggressively.
1020 */
1021 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022 equilibrium = entries - goal;
1023 }
576 1024
577 oldest = rcu_dereference(hash->chain); 1025 if (now - last_gc >= ip_rt_gc_min_interval)
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; 1026 last_gc = now;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) { 1027
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) 1028 if (goal <= 0) {
581 oldest = fnhe; 1029 equilibrium += goal;
1030 goto work_done;
582 } 1031 }
583 orig = rcu_dereference(oldest->fnhe_rth); 1032
584 if (orig) { 1033 do {
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL); 1034 int i, k;
586 rt_free(orig); 1035
1036 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037 unsigned long tmo = expire;
1038
1039 k = (k + 1) & rt_hash_mask;
1040 rthp = &rt_hash_table[k].chain;
1041 spin_lock_bh(rt_hash_lock_addr(k));
1042 while ((rth = rcu_dereference_protected(*rthp,
1043 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044 if (!rt_is_expired(rth) &&
1045 !rt_may_expire(rth, tmo, expire)) {
1046 tmo >>= 1;
1047 rthp = &rth->dst.rt_next;
1048 continue;
1049 }
1050 *rthp = rth->dst.rt_next;
1051 rt_free(rth);
1052 goal--;
1053 }
1054 spin_unlock_bh(rt_hash_lock_addr(k));
1055 if (goal <= 0)
1056 break;
1057 }
1058 rover = k;
1059
1060 if (goal <= 0)
1061 goto work_done;
1062
1063 /* Goal is not achieved. We stop the process if:
1064
1065 - expire has been reduced to zero (otherwise, expire is halved).
1066 - the table is not full.
1067 - we are called from interrupt context.
1068 - the jiffies check is just a fallback/debug loop breaker.
1069 We will not spin here for a long time in any case.
1070 */
1071
1072 RT_CACHE_STAT_INC(gc_goal_miss);
1073
1074 if (expire == 0)
1075 break;
1076
1077 expire >>= 1;
1078
1079 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080 goto out;
1081 } while (!in_softirq() && time_before_eq(jiffies, now));
1082
1083 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084 goto out;
1085 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086 goto out;
1087 if (net_ratelimit())
1088 printk(KERN_WARNING "dst cache overflow\n");
1089 RT_CACHE_STAT_INC(gc_dst_overflow);
1090 return 1;
1091
1092work_done:
1093 expire += ip_rt_gc_min_interval;
1094 if (expire > ip_rt_gc_timeout ||
1095 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097 expire = ip_rt_gc_timeout;
1098out: return 0;
1099}
1100
1101/*
1102 * Returns number of entries in a hash chain that have different hash_inputs
1103 */
1104static int slow_chain_length(const struct rtable *head)
1105{
1106 int length = 0;
1107 const struct rtable *rth = head;
1108
1109 while (rth) {
1110 length += has_noalias(head, rth);
1111 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
587 } 1112 }
588 return oldest; 1113 return length >> FRACT_BITS;
589} 1114}
590 1115
591static inline u32 fnhe_hashfun(__be32 daddr) 1116static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
592{ 1117{
593 u32 hval; 1118 struct neigh_table *tbl = &arp_tbl;
1119 static const __be32 inaddr_any = 0;
1120 struct net_device *dev = dst->dev;
1121 const __be32 *pkey = daddr;
1122 struct neighbour *n;
594 1123
595 hval = (__force u32) daddr; 1124#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
596 hval ^= (hval >> 11) ^ (hval >> 22); 1125 if (dev->type == ARPHRD_ATM)
1126 tbl = clip_tbl_hook;
1127#endif
1128 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1129 pkey = &inaddr_any;
597 1130
598 return hval & (FNHE_HASH_SIZE - 1); 1131 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1132 if (n)
1133 return n;
1134 return neigh_create(tbl, pkey, dev);
599} 1135}
600 1136
601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, 1137static int rt_bind_neighbour(struct rtable *rt)
602 u32 pmtu, unsigned long expires)
603{ 1138{
604 struct fnhe_hash_bucket *hash; 1139 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
605 struct fib_nh_exception *fnhe; 1140 if (IS_ERR(n))
606 int depth; 1141 return PTR_ERR(n);
607 u32 hval = fnhe_hashfun(daddr); 1142 dst_set_neighbour(&rt->dst, n);
608 1143
609 spin_lock_bh(&fnhe_lock); 1144 return 0;
1145}
1146
1147static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148 struct sk_buff *skb, int ifindex)
1149{
1150 struct rtable *rth, *cand;
1151 struct rtable __rcu **rthp, **candp;
1152 unsigned long now;
1153 u32 min_score;
1154 int chain_length;
1155 int attempts = !in_softirq();
610 1156
611 hash = nh->nh_exceptions; 1157restart:
612 if (!hash) { 1158 chain_length = 0;
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); 1159 min_score = ~(u32)0;
614 if (!hash) 1160 cand = NULL;
615 goto out_unlock; 1161 candp = NULL;
616 nh->nh_exceptions = hash; 1162 now = jiffies;
1163
1164 if (!rt_caching(dev_net(rt->dst.dev))) {
1165 /*
1166 * If we're not caching, just tell the caller we
1167 * were successful and don't touch the route. The
1168 * caller hold the sole reference to the cache entry, and
1169 * it will be released when the caller is done with it.
1170 * If we drop it here, the callers have no way to resolve routes
1171 * when we're not caching. Instead, just point *rp at rt, so
1172 * the caller gets a single use out of the route
1173 * Note that we do rt_free on this new route entry, so that
1174 * once its refcount hits zero, we are still able to reap it
1175 * (Thanks Alexey)
1176 * Note: To avoid expensive rcu stuff for this uncached dst,
1177 * we set DST_NOCACHE so that dst_release() can free dst without
1178 * waiting a grace period.
1179 */
1180
1181 rt->dst.flags |= DST_NOCACHE;
1182 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183 int err = rt_bind_neighbour(rt);
1184 if (err) {
1185 if (net_ratelimit())
1186 printk(KERN_WARNING
1187 "Neighbour table failure & not caching routes.\n");
1188 ip_rt_put(rt);
1189 return ERR_PTR(err);
1190 }
1191 }
1192
1193 goto skip_hashing;
617 } 1194 }
618 1195
619 hash += hval; 1196 rthp = &rt_hash_table[hash].chain;
620 1197
621 depth = 0; 1198 spin_lock_bh(rt_hash_lock_addr(hash));
622 for (fnhe = rcu_dereference(hash->chain); fnhe; 1199 while ((rth = rcu_dereference_protected(*rthp,
623 fnhe = rcu_dereference(fnhe->fnhe_next)) { 1200 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
624 if (fnhe->fnhe_daddr == daddr) 1201 if (rt_is_expired(rth)) {
625 break; 1202 *rthp = rth->dst.rt_next;
626 depth++; 1203 rt_free(rth);
1204 continue;
1205 }
1206 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1207 /* Put it first */
1208 *rthp = rth->dst.rt_next;
1209 /*
1210 * Since lookup is lockfree, the deletion
1211 * must be visible to another weakly ordered CPU before
1212 * the insertion at the start of the hash chain.
1213 */
1214 rcu_assign_pointer(rth->dst.rt_next,
1215 rt_hash_table[hash].chain);
1216 /*
1217 * Since lookup is lockfree, the update writes
1218 * must be ordered for consistency on SMP.
1219 */
1220 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1221
1222 dst_use(&rth->dst, now);
1223 spin_unlock_bh(rt_hash_lock_addr(hash));
1224
1225 rt_drop(rt);
1226 if (skb)
1227 skb_dst_set(skb, &rth->dst);
1228 return rth;
1229 }
1230
1231 if (!atomic_read(&rth->dst.__refcnt)) {
1232 u32 score = rt_score(rth);
1233
1234 if (score <= min_score) {
1235 cand = rth;
1236 candp = rthp;
1237 min_score = score;
1238 }
1239 }
1240
1241 chain_length++;
1242
1243 rthp = &rth->dst.rt_next;
627 } 1244 }
628 1245
629 if (fnhe) { 1246 if (cand) {
630 if (gw) 1247 /* ip_rt_gc_elasticity used to be average length of chain
631 fnhe->fnhe_gw = gw; 1248 * length, when exceeded gc becomes really aggressive.
632 if (pmtu) { 1249 *
633 fnhe->fnhe_pmtu = pmtu; 1250 * The second limit is less certain. At the moment it allows
634 fnhe->fnhe_expires = expires; 1251 * only 2 entries per bucket. We will see.
1252 */
1253 if (chain_length > ip_rt_gc_elasticity) {
1254 *candp = cand->dst.rt_next;
1255 rt_free(cand);
635 } 1256 }
636 } else { 1257 } else {
637 if (depth > FNHE_RECLAIM_DEPTH) 1258 if (chain_length > rt_chain_length_max &&
638 fnhe = fnhe_oldest(hash); 1259 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
639 else { 1260 struct net *net = dev_net(rt->dst.dev);
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); 1261 int num = ++net->ipv4.current_rt_cache_rebuild_count;
641 if (!fnhe) 1262 if (!rt_caching(net)) {
642 goto out_unlock; 1263 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
643 1264 rt->dst.dev->name, num);
644 fnhe->fnhe_next = hash->chain; 1265 }
645 rcu_assign_pointer(hash->chain, fnhe); 1266 rt_emergency_hash_rebuild(net);
1267 spin_unlock_bh(rt_hash_lock_addr(hash));
1268
1269 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1270 ifindex, rt_genid(net));
1271 goto restart;
646 } 1272 }
647 fnhe->fnhe_daddr = daddr;
648 fnhe->fnhe_gw = gw;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 } 1273 }
652 1274
653 fnhe->fnhe_stamp = jiffies; 1275 /* Try to bind route to arp only if it is an output
1276 route or a unicast forwarding path.
1277 */
1278 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1279 int err = rt_bind_neighbour(rt);
1280 if (err) {
1281 spin_unlock_bh(rt_hash_lock_addr(hash));
1282
1283 if (err != -ENOBUFS) {
1284 rt_drop(rt);
1285 return ERR_PTR(err);
1286 }
654 1287
655out_unlock: 1288 /* Neighbour tables are full and nothing
656 spin_unlock_bh(&fnhe_lock); 1289 can be released. Try to shrink route cache,
657 return; 1290 as it most likely holds some neighbour records.
1291 */
1292 if (attempts-- > 0) {
1293 int saved_elasticity = ip_rt_gc_elasticity;
1294 int saved_int = ip_rt_gc_min_interval;
1295 ip_rt_gc_elasticity = 1;
1296 ip_rt_gc_min_interval = 0;
1297 rt_garbage_collect(&ipv4_dst_ops);
1298 ip_rt_gc_min_interval = saved_int;
1299 ip_rt_gc_elasticity = saved_elasticity;
1300 goto restart;
1301 }
1302
1303 if (net_ratelimit())
1304 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1305 rt_drop(rt);
1306 return ERR_PTR(-ENOBUFS);
1307 }
1308 }
1309
1310 rt->dst.rt_next = rt_hash_table[hash].chain;
1311
1312 /*
1313 * Since lookup is lockfree, we must make sure
1314 * previous writes to rt are committed to memory
1315 * before making rt visible to other CPUS.
1316 */
1317 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1318
1319 spin_unlock_bh(rt_hash_lock_addr(hash));
1320
1321skip_hashing:
1322 if (skb)
1323 skb_dst_set(skb, &rt->dst);
1324 return rt;
658} 1325}
659 1326
660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, 1327static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
661 bool kill_route) 1328
1329static u32 rt_peer_genid(void)
662{ 1330{
663 __be32 new_gw = icmp_hdr(skb)->un.gateway; 1331 return atomic_read(&__rt_peer_genid);
664 __be32 old_gw = ip_hdr(skb)->saddr; 1332}
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
668 struct neighbour *n;
669 struct net *net;
670 1333
671 switch (icmp_hdr(skb)->code & 7) { 1334void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
672 case ICMP_REDIR_NET: 1335{
673 case ICMP_REDIR_NETTOS: 1336 struct inet_peer *peer;
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
676 break;
677 1337
678 default: 1338 peer = inet_getpeer_v4(daddr, create);
679 return; 1339
1340 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1341 inet_putpeer(peer);
1342 else
1343 rt->rt_peer_genid = rt_peer_genid();
1344}
1345
1346/*
1347 * Peer allocation may fail only in serious out-of-memory conditions. However
1348 * we can still generate some output.
1349 * Random ID selection looks a bit dangerous because we have no way to
1350 * select an ID that is unique within a reasonable period of time.
1351 * But a broken packet identifier may be better than no packet at all.
1352 */
1353static void ip_select_fb_ident(struct iphdr *iph)
1354{
1355 static DEFINE_SPINLOCK(ip_fb_id_lock);
1356 static u32 ip_fallback_id;
1357 u32 salt;
1358
1359 spin_lock_bh(&ip_fb_id_lock);
1360 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1361 iph->id = htons(salt & 0xFFFF);
1362 ip_fallback_id = salt;
1363 spin_unlock_bh(&ip_fb_id_lock);
1364}
1365
1366void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1367{
1368 struct rtable *rt = (struct rtable *) dst;
1369
1370 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1371 if (rt->peer == NULL)
1372 rt_bind_peer(rt, rt->rt_dst, 1);
1373
1374 /* If peer is attached to destination, it is never detached,
1375 so we do not need to grab a lock to dereference it.
1376 */
1377 if (rt->peer) {
1378 iph->id = htons(inet_getid(rt->peer, more));
1379 return;
1380 }
1381 } else if (!rt)
1382 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1383 __builtin_return_address(0));
1384
1385 ip_select_fb_ident(iph);
1386}
1387EXPORT_SYMBOL(__ip_select_ident);
1388
1389static void rt_del(unsigned hash, struct rtable *rt)
1390{
1391 struct rtable __rcu **rthp;
1392 struct rtable *aux;
1393
1394 rthp = &rt_hash_table[hash].chain;
1395 spin_lock_bh(rt_hash_lock_addr(hash));
1396 ip_rt_put(rt);
1397 while ((aux = rcu_dereference_protected(*rthp,
1398 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1399 if (aux == rt || rt_is_expired(aux)) {
1400 *rthp = aux->dst.rt_next;
1401 rt_free(aux);
1402 continue;
1403 }
1404 rthp = &aux->dst.rt_next;
680 } 1405 }
1406 spin_unlock_bh(rt_hash_lock_addr(hash));
1407}
1408
1409static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1410{
1411 struct rtable *rt = (struct rtable *) dst;
1412 __be32 orig_gw = rt->rt_gateway;
1413 struct neighbour *n, *old_n;
1414
1415 dst_confirm(&rt->dst);
681 1416
682 if (rt->rt_gateway != old_gw) 1417 rt->rt_gateway = peer->redirect_learned.a4;
1418
1419 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1420 if (IS_ERR(n)) {
1421 rt->rt_gateway = orig_gw;
683 return; 1422 return;
1423 }
1424 old_n = xchg(&rt->dst._neighbour, n);
1425 if (old_n)
1426 neigh_release(old_n);
1427 if (!(n->nud_state & NUD_VALID)) {
1428 neigh_event_send(n, NULL);
1429 } else {
1430 rt->rt_flags |= RTCF_REDIRECTED;
1431 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1432 }
1433}
1434
1435/* called in rcu_read_lock() section */
1436void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1437 __be32 saddr, struct net_device *dev)
1438{
1439 int s, i;
1440 struct in_device *in_dev = __in_dev_get_rcu(dev);
1441 __be32 skeys[2] = { saddr, 0 };
1442 int ikeys[2] = { dev->ifindex, 0 };
1443 struct inet_peer *peer;
1444 struct net *net;
684 1445
685 in_dev = __in_dev_get_rcu(dev);
686 if (!in_dev) 1446 if (!in_dev)
687 return; 1447 return;
688 1448
@@ -702,50 +1462,74 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
702 goto reject_redirect; 1462 goto reject_redirect;
703 } 1463 }
704 1464
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); 1465 for (s = 0; s < 2; s++) {
706 if (n) { 1466 for (i = 0; i < 2; i++) {
707 if (!(n->nud_state & NUD_VALID)) { 1467 unsigned int hash;
708 neigh_event_send(n, NULL); 1468 struct rtable __rcu **rthp;
709 } else { 1469 struct rtable *rt;
710 if (fib_lookup(net, fl4, &res) == 0) { 1470
711 struct fib_nh *nh = &FIB_RES_NH(res); 1471 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
712 1472
713 update_or_create_fnhe(nh, fl4->daddr, new_gw, 1473 rthp = &rt_hash_table[hash].chain;
714 0, 0); 1474
1475 while ((rt = rcu_dereference(*rthp)) != NULL) {
1476 rthp = &rt->dst.rt_next;
1477
1478 if (rt->rt_key_dst != daddr ||
1479 rt->rt_key_src != skeys[s] ||
1480 rt->rt_oif != ikeys[i] ||
1481 rt_is_input_route(rt) ||
1482 rt_is_expired(rt) ||
1483 !net_eq(dev_net(rt->dst.dev), net) ||
1484 rt->dst.error ||
1485 rt->dst.dev != dev ||
1486 rt->rt_gateway != old_gw)
1487 continue;
1488
1489 if (!rt->peer)
1490 rt_bind_peer(rt, rt->rt_dst, 1);
1491
1492 peer = rt->peer;
1493 if (peer) {
1494 if (peer->redirect_learned.a4 != new_gw ||
1495 peer->redirect_genid != redirect_genid) {
1496 peer->redirect_learned.a4 = new_gw;
1497 peer->redirect_genid = redirect_genid;
1498 atomic_inc(&__rt_peer_genid);
1499 }
1500 check_peer_redir(&rt->dst, peer);
1501 }
715 } 1502 }
716 if (kill_route)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 } 1503 }
720 neigh_release(n);
721 } 1504 }
722 return; 1505 return;
723 1506
724reject_redirect: 1507reject_redirect:
725#ifdef CONFIG_IP_ROUTE_VERBOSE 1508#ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) { 1509 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
727 const struct iphdr *iph = (const struct iphdr *) skb->data; 1510 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
728 __be32 daddr = iph->daddr; 1511 " Advised path = %pI4 -> %pI4\n",
729 __be32 saddr = iph->saddr; 1512 &old_gw, dev->name, &new_gw,
730 1513 &saddr, &daddr);
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
734 &saddr, &daddr);
735 }
736#endif 1514#endif
737 ; 1515 ;
738} 1516}
739 1517
740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 1518static bool peer_pmtu_expired(struct inet_peer *peer)
741{ 1519{
742 struct rtable *rt; 1520 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
743 struct flowi4 fl4;
744 1521
745 rt = (struct rtable *) dst; 1522 return orig &&
1523 time_after_eq(jiffies, orig) &&
1524 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525}
1526
1527static bool peer_pmtu_cleaned(struct inet_peer *peer)
1528{
1529 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
746 1530
747 ip_rt_build_flow_key(&fl4, sk, skb); 1531 return orig &&
748 __ip_do_redirect(rt, skb, &fl4, true); 1532 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
749} 1533}
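peer_pmtu_expired() and peer_pmtu_cleaned() use cmpxchg() on pmtu_expires so that, when several CPUs race, exactly one of them observes the transition and restores the original MTU. A userspace sketch of that claim-once idiom using C11 atomics (the variable name mirrors the peer field but is otherwise illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long pmtu_expires;   /* 0 means "no learned PMTU pending" */

/* Return true for exactly one caller: the one whose compare-and-swap
 * clears the deadline, mirroring peer_pmtu_cleaned(). */
static bool claim_pmtu_reset(void)
{
	unsigned long orig = atomic_load(&pmtu_expires);

	return orig != 0 &&
	       atomic_compare_exchange_strong(&pmtu_expires, &orig, 0UL);
}

int main(void)
{
	atomic_store(&pmtu_expires, 12345UL);

	printf("first caller wins:   %d\n", claim_pmtu_reset());  /* 1 */
	printf("second caller loses: %d\n", claim_pmtu_reset());  /* 0 */
	return 0;
}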
750 1534
751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1535static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -757,10 +1541,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
757 if (dst->obsolete > 0) { 1541 if (dst->obsolete > 0) {
758 ip_rt_put(rt); 1542 ip_rt_put(rt);
759 ret = NULL; 1543 ret = NULL;
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1544 } else if (rt->rt_flags & RTCF_REDIRECTED) {
761 rt->dst.expires) { 1545 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
762 ip_rt_put(rt); 1546 rt->rt_oif,
1547 rt_genid(dev_net(dst->dev)));
1548 rt_del(hash, rt);
763 ret = NULL; 1549 ret = NULL;
1550 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1551 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
764 } 1552 }
765 } 1553 }
766 return ret; 1554 return ret;
@@ -787,7 +1575,6 @@ void ip_rt_send_redirect(struct sk_buff *skb)
787 struct rtable *rt = skb_rtable(skb); 1575 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev; 1576 struct in_device *in_dev;
789 struct inet_peer *peer; 1577 struct inet_peer *peer;
790 struct net *net;
791 int log_martians; 1578 int log_martians;
792 1579
793 rcu_read_lock(); 1580 rcu_read_lock();
@@ -799,11 +1586,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1586 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 rcu_read_unlock(); 1587 rcu_read_unlock();
801 1588
802 net = dev_net(rt->dst.dev); 1589 if (!rt->peer)
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1590 rt_bind_peer(rt, rt->rt_dst, 1);
1591 peer = rt->peer;
804 if (!peer) { 1592 if (!peer) {
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, 1593 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
806 rt_nexthop(rt, ip_hdr(skb)->daddr));
807 return; 1594 return;
808 } 1595 }
809 1596
@@ -818,7 +1605,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
818 */ 1605 */
819 if (peer->rate_tokens >= ip_rt_redirect_number) { 1606 if (peer->rate_tokens >= ip_rt_redirect_number) {
820 peer->rate_last = jiffies; 1607 peer->rate_last = jiffies;
821 goto out_put_peer; 1608 return;
822 } 1609 }
823 1610
824 /* Check for load limit; set rate_last to the latest sent 1611 /* Check for load limit; set rate_last to the latest sent
@@ -828,47 +1615,28 @@ void ip_rt_send_redirect(struct sk_buff *skb)
828 time_after(jiffies, 1615 time_after(jiffies,
829 (peer->rate_last + 1616 (peer->rate_last +
830 (ip_rt_redirect_load << peer->rate_tokens)))) { 1617 (ip_rt_redirect_load << peer->rate_tokens)))) {
831 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); 1618 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
832
833 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 peer->rate_last = jiffies; 1619 peer->rate_last = jiffies;
835 ++peer->rate_tokens; 1620 ++peer->rate_tokens;
836#ifdef CONFIG_IP_ROUTE_VERBOSE 1621#ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (log_martians && 1622 if (log_martians &&
838 peer->rate_tokens == ip_rt_redirect_number) 1623 peer->rate_tokens == ip_rt_redirect_number &&
839 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 1624 net_ratelimit())
840 &ip_hdr(skb)->saddr, inet_iif(skb), 1625 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
841 &ip_hdr(skb)->daddr, &gw); 1626 &ip_hdr(skb)->saddr, rt->rt_iif,
1627 &rt->rt_dst, &rt->rt_gateway);
842#endif 1628#endif
843 } 1629 }
844out_put_peer:
845 inet_putpeer(peer);
846} 1630}
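ip_rt_send_redirect() spaces out ICMP redirects by requiring jiffies to pass rate_last + (ip_rt_redirect_load << rate_tokens), so every redirect sent doubles the wait before the next one, and after ip_rt_redirect_number redirects it goes quiet. A small sketch of that exponential backoff check (the constants and field names are illustrative stand-ins for the sysctls and peer fields):

#include <stdio.h>
#include <stdbool.h>

#define REDIRECT_LOAD   2      /* base delay in ticks, like ip_rt_redirect_load */
#define REDIRECT_NUMBER 9      /* max redirects before silence, like ip_rt_redirect_number */

struct peer_state {
	unsigned long rate_last;   /* tick of the last redirect sent */
	unsigned int rate_tokens;  /* redirects sent so far in this burst */
};

/* Decide whether a redirect may be sent at time `now`, doubling the
 * required gap with every token already spent. */
static bool may_send_redirect(struct peer_state *p, unsigned long now)
{
	if (p->rate_tokens >= REDIRECT_NUMBER) {
		p->rate_last = now;
		return false;                    /* host ignores us; go quiet */
	}
	if (p->rate_tokens &&
	    now < p->rate_last + ((unsigned long)REDIRECT_LOAD << p->rate_tokens))
		return false;                    /* still inside the backoff window */
	p->rate_last = now;
	p->rate_tokens++;
	return true;
}

int main(void)
{
	struct peer_state p = { 0, 0 };

	/* Prints redirects at ticks 0, 4, 12, 28: each gap doubles. */
	for (unsigned long now = 0; now < 40; now++)
		if (may_send_redirect(&p, now))
			printf("redirect sent at tick %lu (tokens=%u)\n", now, p.rate_tokens);
	return 0;
}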
847 1631
848static int ip_error(struct sk_buff *skb) 1632static int ip_error(struct sk_buff *skb)
849{ 1633{
850 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 struct rtable *rt = skb_rtable(skb); 1634 struct rtable *rt = skb_rtable(skb);
852 struct inet_peer *peer; 1635 struct inet_peer *peer;
853 unsigned long now; 1636 unsigned long now;
854 struct net *net;
855 bool send; 1637 bool send;
856 int code; 1638 int code;
857 1639
858 net = dev_net(rt->dst.dev);
859 if (!IN_DEV_FORWARD(in_dev)) {
860 switch (rt->dst.error) {
861 case EHOSTUNREACH:
862 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 break;
864
865 case ENETUNREACH:
866 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 break;
868 }
869 goto out;
870 }
871
872 switch (rt->dst.error) { 1640 switch (rt->dst.error) {
873 case EINVAL: 1641 case EINVAL:
874 default: 1642 default:
@@ -878,14 +1646,17 @@ static int ip_error(struct sk_buff *skb)
878 break; 1646 break;
879 case ENETUNREACH: 1647 case ENETUNREACH:
880 code = ICMP_NET_UNREACH; 1648 code = ICMP_NET_UNREACH;
881 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); 1649 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1650 IPSTATS_MIB_INNOROUTES);
882 break; 1651 break;
883 case EACCES: 1652 case EACCES:
884 code = ICMP_PKT_FILTERED; 1653 code = ICMP_PKT_FILTERED;
885 break; 1654 break;
886 } 1655 }
887 1656
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1657 if (!rt->peer)
1658 rt_bind_peer(rt, rt->rt_dst, 1);
1659 peer = rt->peer;
889 1660
890 send = true; 1661 send = true;
891 if (peer) { 1662 if (peer) {
@@ -898,7 +1669,6 @@ static int ip_error(struct sk_buff *skb)
898 peer->rate_tokens -= ip_rt_error_cost; 1669 peer->rate_tokens -= ip_rt_error_cost;
899 else 1670 else
900 send = false; 1671 send = false;
901 inet_putpeer(peer);
902 } 1672 }
903 if (send) 1673 if (send)
904 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1674 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -907,125 +1677,165 @@ out: kfree_skb(skb);
907 return 0; 1677 return 0;
908} 1678}
909 1679
910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) 1680/*
1681 * The last two values are not from the RFC but
1682 * are needed for AMPRnet AX.25 paths.
1683 */
1684
1685static const unsigned short mtu_plateau[] =
1686{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1687
1688static inline unsigned short guess_mtu(unsigned short old_mtu)
911{ 1689{
912 struct dst_entry *dst = &rt->dst; 1690 int i;
913 struct fib_result res;
914 1691
915 if (dst->dev->mtu < mtu) 1692 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
916 return; 1693 if (old_mtu > mtu_plateau[i])
1694 return mtu_plateau[i];
1695 return 68;
1696}
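guess_mtu() walks the plateau table (the RFC 1191 values plus the two AX.25 entries noted above) and returns the first plateau strictly below the MTU implied by the offending packet, bottoming out at the 68-byte IPv4 minimum. A tiny standalone version with a worked example (the table is copied from the diff above):

#include <stdio.h>

static const unsigned short mtu_plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess_mtu(unsigned short old_mtu)
{
	for (unsigned int i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

int main(void)
{
	/* A 1500-byte packet bounced without a next-hop MTU drops to the 1492
	 * plateau; a 300-byte one drops to 296; tiny packets bottom out at 68. */
	printf("%u %u %u\n", guess_mtu(1500), guess_mtu(300), guess_mtu(68));
	return 0;
}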
917 1697
918 if (mtu < ip_rt_min_pmtu) 1698unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
919 mtu = ip_rt_min_pmtu; 1699 unsigned short new_mtu,
1700 struct net_device *dev)
1701{
1702 unsigned short old_mtu = ntohs(iph->tot_len);
1703 unsigned short est_mtu = 0;
1704 struct inet_peer *peer;
920 1705
921 if (!rt->rt_pmtu) { 1706 peer = inet_getpeer_v4(iph->daddr, 1);
922 dst->obsolete = DST_OBSOLETE_KILL; 1707 if (peer) {
923 } else { 1708 unsigned short mtu = new_mtu;
924 rt->rt_pmtu = mtu;
925 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926 }
927 1709
928 rcu_read_lock(); 1710 if (new_mtu < 68 || new_mtu >= old_mtu) {
929 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { 1711 /* BSD 4.2 derived systems incorrectly adjust
930 struct fib_nh *nh = &FIB_RES_NH(res); 1712 * tot_len by the IP header length, and report
1713 * a zero MTU in the ICMP message.
1714 */
1715 if (mtu == 0 &&
1716 old_mtu >= 68 + (iph->ihl << 2))
1717 old_mtu -= iph->ihl << 2;
1718 mtu = guess_mtu(old_mtu);
1719 }
1720
1721 if (mtu < ip_rt_min_pmtu)
1722 mtu = ip_rt_min_pmtu;
1723 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1724 unsigned long pmtu_expires;
931 1725
932 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, 1726 pmtu_expires = jiffies + ip_rt_mtu_expires;
933 jiffies + ip_rt_mtu_expires); 1727 if (!pmtu_expires)
1728 pmtu_expires = 1UL;
1729
1730 est_mtu = mtu;
1731 peer->pmtu_learned = mtu;
1732 peer->pmtu_expires = pmtu_expires;
1733 atomic_inc(&__rt_peer_genid);
1734 }
1735
1736 inet_putpeer(peer);
934 } 1737 }
935 rcu_read_unlock(); 1738 return est_mtu ? : new_mtu;
936} 1739}
937 1740
938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 1741static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
939 struct sk_buff *skb, u32 mtu)
940{ 1742{
941 struct rtable *rt = (struct rtable *) dst; 1743 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
942 struct flowi4 fl4;
943 1744
944 ip_rt_build_flow_key(&fl4, sk, skb); 1745 if (!expires)
945 __ip_rt_update_pmtu(rt, &fl4, mtu); 1746 return;
1747 if (time_before(jiffies, expires)) {
1748 u32 orig_dst_mtu = dst_mtu(dst);
1749 if (peer->pmtu_learned < orig_dst_mtu) {
1750 if (!peer->pmtu_orig)
1751 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1752 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1753 }
1754 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1755 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
946} 1756}
947 1757
948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 1758static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
949 int oif, u32 mark, u8 protocol, int flow_flags)
950{ 1759{
951 const struct iphdr *iph = (const struct iphdr *) skb->data; 1760 struct rtable *rt = (struct rtable *) dst;
952 struct flowi4 fl4; 1761 struct inet_peer *peer;
953 struct rtable *rt;
954 1762
955 __build_flow_key(&fl4, NULL, iph, oif, 1763 dst_confirm(dst);
956 RT_TOS(iph->tos), protocol, mark, flow_flags);
957 rt = __ip_route_output_key(net, &fl4);
958 if (!IS_ERR(rt)) {
959 __ip_rt_update_pmtu(rt, &fl4, mtu);
960 ip_rt_put(rt);
961 }
962}
963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964 1764
965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) 1765 if (!rt->peer)
966{ 1766 rt_bind_peer(rt, rt->rt_dst, 1);
967 const struct iphdr *iph = (const struct iphdr *) skb->data; 1767 peer = rt->peer;
968 struct flowi4 fl4; 1768 if (peer) {
969 struct rtable *rt; 1769 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
970 1770
971 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1771 if (mtu < ip_rt_min_pmtu)
972 rt = __ip_route_output_key(sock_net(sk), &fl4); 1772 mtu = ip_rt_min_pmtu;
973 if (!IS_ERR(rt)) { 1773 if (!pmtu_expires || mtu < peer->pmtu_learned) {
974 __ip_rt_update_pmtu(rt, &fl4, mtu);
975 ip_rt_put(rt);
976 }
977}
978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979 1774
980void ipv4_redirect(struct sk_buff *skb, struct net *net, 1775 pmtu_expires = jiffies + ip_rt_mtu_expires;
981 int oif, u32 mark, u8 protocol, int flow_flags) 1776 if (!pmtu_expires)
982{ 1777 pmtu_expires = 1UL;
983 const struct iphdr *iph = (const struct iphdr *) skb->data;
984 struct flowi4 fl4;
985 struct rtable *rt;
986 1778
987 __build_flow_key(&fl4, NULL, iph, oif, 1779 peer->pmtu_learned = mtu;
988 RT_TOS(iph->tos), protocol, mark, flow_flags); 1780 peer->pmtu_expires = pmtu_expires;
989 rt = __ip_route_output_key(net, &fl4); 1781
990 if (!IS_ERR(rt)) { 1782 atomic_inc(&__rt_peer_genid);
991 __ip_do_redirect(rt, skb, &fl4, false); 1783 rt->rt_peer_genid = rt_peer_genid();
992 ip_rt_put(rt); 1784 }
1785 check_peer_pmtu(dst, peer);
993 } 1786 }
994} 1787}
995EXPORT_SYMBOL_GPL(ipv4_redirect);
996 1788
997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) 1789
1790static void ipv4_validate_peer(struct rtable *rt)
998{ 1791{
999 const struct iphdr *iph = (const struct iphdr *) skb->data; 1792 if (rt->rt_peer_genid != rt_peer_genid()) {
1000 struct flowi4 fl4; 1793 struct inet_peer *peer;
1001 struct rtable *rt; 1794
1795 if (!rt->peer)
1796 rt_bind_peer(rt, rt->rt_dst, 0);
1002 1797
1003 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1798 peer = rt->peer;
1004 rt = __ip_route_output_key(sock_net(sk), &fl4); 1799 if (peer) {
1005 if (!IS_ERR(rt)) { 1800 check_peer_pmtu(&rt->dst, peer);
1006 __ip_do_redirect(rt, skb, &fl4, false); 1801
1007 ip_rt_put(rt); 1802 if (peer->redirect_genid != redirect_genid)
1803 peer->redirect_learned.a4 = 0;
1804 if (peer->redirect_learned.a4 &&
1805 peer->redirect_learned.a4 != rt->rt_gateway)
1806 check_peer_redir(&rt->dst, peer);
1807 }
1808
1809 rt->rt_peer_genid = rt_peer_genid();
1008 } 1810 }
1009} 1811}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011 1812
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1813static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{ 1814{
1014 struct rtable *rt = (struct rtable *) dst; 1815 struct rtable *rt = (struct rtable *) dst;
1015 1816
1016 /* All IPV4 dsts are created with ->obsolete set to the value 1817 if (rt_is_expired(rt))
1017 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018 * into this function always.
1019 *
1020 * When a PMTU/redirect information update invalidates a
1021 * route, this is indicated by setting obsolete to
1022 * DST_OBSOLETE_KILL.
1023 */
1024 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025 return NULL; 1818 return NULL;
1819 ipv4_validate_peer(rt);
1026 return dst; 1820 return dst;
1027} 1821}
1028 1822
1823static void ipv4_dst_destroy(struct dst_entry *dst)
1824{
1825 struct rtable *rt = (struct rtable *) dst;
1826 struct inet_peer *peer = rt->peer;
1827
1828 if (rt->fi) {
1829 fib_info_put(rt->fi);
1830 rt->fi = NULL;
1831 }
1832 if (peer) {
1833 rt->peer = NULL;
1834 inet_putpeer(peer);
1835 }
1836}
1837
1838
1029static void ipv4_link_failure(struct sk_buff *skb) 1839static void ipv4_link_failure(struct sk_buff *skb)
1030{ 1840{
1031 struct rtable *rt; 1841 struct rtable *rt;
@@ -1033,15 +1843,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
1033 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1843 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034 1844
1035 rt = skb_rtable(skb); 1845 rt = skb_rtable(skb);
1036 if (rt) 1846 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1037 dst_set_expires(&rt->dst, 0); 1847 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1038} 1848}
1039 1849
1040static int ip_rt_bug(struct sk_buff *skb) 1850static int ip_rt_bug(struct sk_buff *skb)
1041{ 1851{
1042 pr_debug("%s: %pI4 -> %pI4, %s\n", 1852 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1043 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1853 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 skb->dev ? skb->dev->name : "?"); 1854 skb->dev ? skb->dev->name : "?");
1045 kfree_skb(skb); 1855 kfree_skb(skb);
1046 WARN_ON(1); 1856 WARN_ON(1);
1047 return 0; 1857 return 0;
@@ -1081,9 +1891,8 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1081 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1891 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1892 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083 else 1893 else
1084 src = inet_select_addr(rt->dst.dev, 1894 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1085 rt_nexthop(rt, iph->daddr), 1895 RT_SCOPE_UNIVERSE);
1086 RT_SCOPE_UNIVERSE);
1087 rcu_read_unlock(); 1896 rcu_read_unlock();
1088 } 1897 }
1089 memcpy(addr, &src, 4); 1898 memcpy(addr, &src, 4);
@@ -1112,21 +1921,14 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1112 return advmss; 1921 return advmss;
1113} 1922}
1114 1923
1115static unsigned int ipv4_mtu(const struct dst_entry *dst) 1924static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1116{ 1925{
1117 const struct rtable *rt = (const struct rtable *) dst; 1926 unsigned int mtu = dst->dev->mtu;
1118 unsigned int mtu = rt->rt_pmtu;
1119
1120 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1121 mtu = dst_metric_raw(dst, RTAX_MTU);
1122
1123 if (mtu && rt_is_output_route(rt))
1124 return mtu;
1125
1126 mtu = dst->dev->mtu;
1127 1927
1128 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1928 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1129 if (rt->rt_uses_gateway && mtu > 576) 1929 const struct rtable *rt = (const struct rtable *) dst;
1930
1931 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1130 mtu = 576; 1932 mtu = 576;
1131 } 1933 }
1132 1934
@@ -1136,184 +1938,77 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1136 return mtu; 1938 return mtu;
1137} 1939}
1138 1940
1139static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1941static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1942 struct fib_info *fi)
1140{ 1943{
1141 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 1944 struct inet_peer *peer;
1142 struct fib_nh_exception *fnhe; 1945 int create = 0;
1143 u32 hval;
1144
1145 if (!hash)
1146 return NULL;
1147
1148 hval = fnhe_hashfun(daddr);
1149
1150 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152 if (fnhe->fnhe_daddr == daddr)
1153 return fnhe;
1154 }
1155 return NULL;
1156}
1157
1158static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1159 __be32 daddr)
1160{
1161 bool ret = false;
1162
1163 spin_lock_bh(&fnhe_lock);
1164 1946
1165 if (daddr == fnhe->fnhe_daddr) { 1947 /* If a peer entry exists for this destination, we must hook
1166 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth); 1948 * it up in order to get at cached metrics.
1167 if (orig && rt_is_expired(orig)) { 1949 */
1168 fnhe->fnhe_gw = 0; 1950 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1169 fnhe->fnhe_pmtu = 0; 1951 create = 1;
1170 fnhe->fnhe_expires = 0;
1171 }
1172 if (fnhe->fnhe_pmtu) {
1173 unsigned long expires = fnhe->fnhe_expires;
1174 unsigned long diff = expires - jiffies;
1175 1952
1176 if (time_before(jiffies, expires)) { 1953 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1177 rt->rt_pmtu = fnhe->fnhe_pmtu; 1954 if (peer) {
1178 dst_set_expires(&rt->dst, diff); 1955 rt->rt_peer_genid = rt_peer_genid();
1179 } 1956 if (inet_metrics_new(peer))
1180 } 1957 memcpy(peer->metrics, fi->fib_metrics,
1181 if (fnhe->fnhe_gw) { 1958 sizeof(u32) * RTAX_MAX);
1959 dst_init_metrics(&rt->dst, peer->metrics, false);
1960
1961 check_peer_pmtu(&rt->dst, peer);
1962 if (peer->redirect_genid != redirect_genid)
1963 peer->redirect_learned.a4 = 0;
1964 if (peer->redirect_learned.a4 &&
1965 peer->redirect_learned.a4 != rt->rt_gateway) {
1966 rt->rt_gateway = peer->redirect_learned.a4;
1182 rt->rt_flags |= RTCF_REDIRECTED; 1967 rt->rt_flags |= RTCF_REDIRECTED;
1183 rt->rt_gateway = fnhe->fnhe_gw; 1968 }
1184 rt->rt_uses_gateway = 1;
1185 } else if (!rt->rt_gateway)
1186 rt->rt_gateway = daddr;
1187
1188 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189 if (orig)
1190 rt_free(orig);
1191
1192 fnhe->fnhe_stamp = jiffies;
1193 ret = true;
1194 }
1195 spin_unlock_bh(&fnhe_lock);
1196
1197 return ret;
1198}
1199
1200static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201{
1202 struct rtable *orig, *prev, **p;
1203 bool ret = true;
1204
1205 if (rt_is_input_route(rt)) {
1206 p = (struct rtable **)&nh->nh_rth_input;
1207 } else { 1969 } else {
1208 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); 1970 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1209 } 1971 rt->fi = fi;
1210 orig = *p; 1972 atomic_inc(&fi->fib_clntref);
1211
1212 prev = cmpxchg(p, orig, rt);
1213 if (prev == orig) {
1214 if (orig)
1215 rt_free(orig);
1216 } else
1217 ret = false;
1218
1219 return ret;
1220}
1221
1222static DEFINE_SPINLOCK(rt_uncached_lock);
1223static LIST_HEAD(rt_uncached_list);
1224
1225static void rt_add_uncached_list(struct rtable *rt)
1226{
1227 spin_lock_bh(&rt_uncached_lock);
1228 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229 spin_unlock_bh(&rt_uncached_lock);
1230}
1231
1232static void ipv4_dst_destroy(struct dst_entry *dst)
1233{
1234 struct rtable *rt = (struct rtable *) dst;
1235
1236 if (!list_empty(&rt->rt_uncached)) {
1237 spin_lock_bh(&rt_uncached_lock);
1238 list_del(&rt->rt_uncached);
1239 spin_unlock_bh(&rt_uncached_lock);
1240 }
1241}
1242
1243void rt_flush_dev(struct net_device *dev)
1244{
1245 if (!list_empty(&rt_uncached_list)) {
1246 struct net *net = dev_net(dev);
1247 struct rtable *rt;
1248
1249 spin_lock_bh(&rt_uncached_lock);
1250 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251 if (rt->dst.dev != dev)
1252 continue;
1253 rt->dst.dev = net->loopback_dev;
1254 dev_hold(rt->dst.dev);
1255 dev_put(dev);
1256 } 1973 }
1257 spin_unlock_bh(&rt_uncached_lock); 1974 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1258 } 1975 }
1259} 1976}
1260 1977
1261static bool rt_cache_valid(const struct rtable *rt) 1978static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1262{
1263 return rt &&
1264 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265 !rt_is_expired(rt);
1266}
1267
1268static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1269 const struct fib_result *res, 1979 const struct fib_result *res,
1270 struct fib_nh_exception *fnhe,
1271 struct fib_info *fi, u16 type, u32 itag) 1980 struct fib_info *fi, u16 type, u32 itag)
1272{ 1981{
1273 bool cached = false; 1982 struct dst_entry *dst = &rt->dst;
1274 1983
1275 if (fi) { 1984 if (fi) {
1276 struct fib_nh *nh = &FIB_RES_NH(*res); 1985 if (FIB_RES_GW(*res) &&
1277 1986 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1278 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { 1987 rt->rt_gateway = FIB_RES_GW(*res);
1279 rt->rt_gateway = nh->nh_gw; 1988 rt_init_metrics(rt, fl4, fi);
1280 rt->rt_uses_gateway = 1;
1281 }
1282 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1283#ifdef CONFIG_IP_ROUTE_CLASSID 1989#ifdef CONFIG_IP_ROUTE_CLASSID
1284 rt->dst.tclassid = nh->nh_tclassid; 1990 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1285#endif 1991#endif
1286 if (unlikely(fnhe)) 1992 }
1287 cached = rt_bind_exception(rt, fnhe, daddr); 1993
1288 else if (!(rt->dst.flags & DST_NOCACHE)) 1994 if (dst_mtu(dst) > IP_MAX_MTU)
1289 cached = rt_cache_route(nh, rt); 1995 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1290 if (unlikely(!cached)) { 1996 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1291 /* Routes we intend to cache in nexthop exception or 1997 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1292 * FIB nexthop have the DST_NOCACHE bit clear.
1293 * However, if we are unsuccessful at storing this
1294 * route into the cache we really need to set it.
1295 */
1296 rt->dst.flags |= DST_NOCACHE;
1297 if (!rt->rt_gateway)
1298 rt->rt_gateway = daddr;
1299 rt_add_uncached_list(rt);
1300 }
1301 } else
1302 rt_add_uncached_list(rt);
1303 1998
1304#ifdef CONFIG_IP_ROUTE_CLASSID 1999#ifdef CONFIG_IP_ROUTE_CLASSID
1305#ifdef CONFIG_IP_MULTIPLE_TABLES 2000#ifdef CONFIG_IP_MULTIPLE_TABLES
1306 set_class_tag(rt, res->tclassid); 2001 set_class_tag(rt, fib_rules_tclass(res));
1307#endif 2002#endif
1308 set_class_tag(rt, itag); 2003 set_class_tag(rt, itag);
1309#endif 2004#endif
1310} 2005}
1311 2006
1312static struct rtable *rt_dst_alloc(struct net_device *dev, 2007static struct rtable *rt_dst_alloc(struct net_device *dev,
1313 bool nopolicy, bool noxfrm, bool will_cache) 2008 bool nopolicy, bool noxfrm)
1314{ 2009{
1315 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 2010 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1316 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | 2011 DST_HOST |
1317 (nopolicy ? DST_NOPOLICY : 0) | 2012 (nopolicy ? DST_NOPOLICY : 0) |
1318 (noxfrm ? DST_NOXFRM : 0)); 2013 (noxfrm ? DST_NOXFRM : 0));
1319} 2014}
@@ -1322,7 +2017,9 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
1322static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2017static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323 u8 tos, struct net_device *dev, int our) 2018 u8 tos, struct net_device *dev, int our)
1324{ 2019{
2020 unsigned int hash;
1325 struct rtable *rth; 2021 struct rtable *rth;
2022 __be32 spec_dst;
1326 struct in_device *in_dev = __in_dev_get_rcu(dev); 2023 struct in_device *in_dev = __in_dev_get_rcu(dev);
1327 u32 itag = 0; 2024 u32 itag = 0;
1328 int err; 2025 int err;
@@ -1333,24 +2030,21 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1333 return -EINVAL; 2030 return -EINVAL;
1334 2031
1335 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 2032 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336 skb->protocol != htons(ETH_P_IP)) 2033 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1337 goto e_inval; 2034 goto e_inval;
1338 2035
1339 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340 if (ipv4_is_loopback(saddr))
1341 goto e_inval;
1342
1343 if (ipv4_is_zeronet(saddr)) { 2036 if (ipv4_is_zeronet(saddr)) {
1344 if (!ipv4_is_local_multicast(daddr)) 2037 if (!ipv4_is_local_multicast(daddr))
1345 goto e_inval; 2038 goto e_inval;
2039 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1346 } else { 2040 } else {
1347 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2041 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1348 in_dev, &itag); 2042 &itag);
1349 if (err < 0) 2043 if (err < 0)
1350 goto e_err; 2044 goto e_err;
1351 } 2045 }
1352 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, 2046 rth = rt_dst_alloc(init_net.loopback_dev,
1353 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); 2047 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1354 if (!rth) 2048 if (!rth)
1355 goto e_nobufs; 2049 goto e_nobufs;
1356 2050
@@ -1359,15 +2053,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1359#endif 2053#endif
1360 rth->dst.output = ip_rt_bug; 2054 rth->dst.output = ip_rt_bug;
1361 2055
2056 rth->rt_key_dst = daddr;
2057 rth->rt_key_src = saddr;
1362 rth->rt_genid = rt_genid(dev_net(dev)); 2058 rth->rt_genid = rt_genid(dev_net(dev));
1363 rth->rt_flags = RTCF_MULTICAST; 2059 rth->rt_flags = RTCF_MULTICAST;
1364 rth->rt_type = RTN_MULTICAST; 2060 rth->rt_type = RTN_MULTICAST;
1365 rth->rt_is_input= 1; 2061 rth->rt_key_tos = tos;
1366 rth->rt_iif = 0; 2062 rth->rt_dst = daddr;
1367 rth->rt_pmtu = 0; 2063 rth->rt_src = saddr;
1368 rth->rt_gateway = 0; 2064 rth->rt_route_iif = dev->ifindex;
1369 rth->rt_uses_gateway = 0; 2065 rth->rt_iif = dev->ifindex;
1370 INIT_LIST_HEAD(&rth->rt_uncached); 2066 rth->rt_oif = 0;
2067 rth->rt_mark = skb->mark;
2068 rth->rt_gateway = daddr;
2069 rth->rt_spec_dst= spec_dst;
2070 rth->rt_peer_genid = 0;
2071 rth->peer = NULL;
2072 rth->fi = NULL;
1371 if (our) { 2073 if (our) {
1372 rth->dst.input= ip_local_deliver; 2074 rth->dst.input= ip_local_deliver;
1373 rth->rt_flags |= RTCF_LOCAL; 2075 rth->rt_flags |= RTCF_LOCAL;
@@ -1379,8 +2081,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1379#endif 2081#endif
1380 RT_CACHE_STAT_INC(in_slow_mc); 2082 RT_CACHE_STAT_INC(in_slow_mc);
1381 2083
1382 skb_dst_set(skb, &rth->dst); 2084 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1383 return 0; 2085 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2086 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1384 2087
1385e_nobufs: 2088e_nobufs:
1386 return -ENOBUFS; 2089 return -ENOBUFS;
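The replacement code in the right-hand column finishes ip_route_input_mc() by hashing the flow with rt_hash() and inserting the new rtable into the global route cache through rt_intern_hash(), while the left-hand column simply attaches the dst to the skb. A minimal userspace model of that bucket-chain cache, using a stand-in multiplicative hash instead of the kernel's rt_hash() and purely illustrative names, is sketched below:

/* Userspace model of a bucket-chain route cache keyed by
 * (daddr, saddr, iif, tos).  toy_hash() is a stand-in for the kernel's
 * Jenkins-hash-based rt_hash(); all names here are illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RT_HASH_BITS 8
#define RT_HASH_SIZE (1u << RT_HASH_BITS)

struct cache_entry {
	uint32_t daddr, saddr;
	int iif;
	uint8_t tos;
	struct cache_entry *next;
};

static struct cache_entry *buckets[RT_HASH_SIZE];

static unsigned int toy_hash(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	uint32_t h = daddr * 2654435761u ^ saddr * 2246822519u ^ (uint32_t)iif ^ tos;

	return (h >> (32 - RT_HASH_BITS)) & (RT_HASH_SIZE - 1);
}

static struct cache_entry *cache_lookup(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	unsigned int h = toy_hash(daddr, saddr, iif, tos);

	for (struct cache_entry *e = buckets[h]; e; e = e->next)
		if (e->daddr == daddr && e->saddr == saddr &&
		    e->iif == iif && e->tos == tos)
			return e;
	return NULL;
}

static struct cache_entry *cache_insert(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	unsigned int h = toy_hash(daddr, saddr, iif, tos);
	struct cache_entry *e = calloc(1, sizeof(*e));

	if (!e)
		return NULL;
	e->daddr = daddr;
	e->saddr = saddr;
	e->iif = iif;
	e->tos = tos;
	e->next = buckets[h];	/* new entries go at the head of the chain */
	buckets[h] = e;
	return e;
}

int main(void)
{
	cache_insert(0x0a000001, 0x0a000002, 2, 0);
	printf("hit: %s\n", cache_lookup(0x0a000001, 0x0a000002, 2, 0) ? "yes" : "no");
	return 0;
}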
@@ -1404,13 +2107,18 @@ static void ip_handle_martian_source(struct net_device *dev,
1404 * RFC1812 recommendation, if source is martian, 2107 * RFC1812 recommendation, if source is martian,
1405 * the only hint is MAC header. 2108 * the only hint is MAC header.
1406 */ 2109 */
1407 pr_warn("martian source %pI4 from %pI4, on dev %s\n", 2110 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1408 &daddr, &saddr, dev->name); 2111 &daddr, &saddr, dev->name);
1409 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 2112 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410 print_hex_dump(KERN_WARNING, "ll header: ", 2113 int i;
1411 DUMP_PREFIX_OFFSET, 16, 1, 2114 const unsigned char *p = skb_mac_header(skb);
1412 skb_mac_header(skb), 2115 printk(KERN_WARNING "ll header: ");
1413 dev->hard_header_len, true); 2116 for (i = 0; i < dev->hard_header_len; i++, p++) {
2117 printk("%02x", *p);
2118 if (i < (dev->hard_header_len - 1))
2119 printk(":");
2120 }
2121 printk("\n");
1414 } 2122 }
1415 } 2123 }
1416#endif 2124#endif
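ip_handle_martian_source() logs the link-layer header of the offending packet; the left column uses print_hex_dump(), the right column open-codes the same colon-separated hex loop. That loop is equivalent to roughly this small userspace helper (the sample header is made up for the demo):

#include <stdio.h>

/* Print a link-layer header as colon-separated hex, mirroring the
 * open-coded printk loop shown in the right-hand column. */
static void dump_ll_header(const unsigned char *p, int len)
{
	printf("ll header: ");
	for (int i = 0; i < len; i++)
		printf("%02x%s", p[i], i < len - 1 ? ":" : "");
	printf("\n");
}

int main(void)
{
	/* A made-up 14-byte Ethernet header: dst MAC, src MAC, ethertype 0x0800 */
	const unsigned char hdr[14] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
		0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
		0x08, 0x00,
	};

	dump_ll_header(hdr, sizeof(hdr));
	return 0;
}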
@@ -1420,24 +2128,28 @@ static void ip_handle_martian_source(struct net_device *dev,
1420static int __mkroute_input(struct sk_buff *skb, 2128static int __mkroute_input(struct sk_buff *skb,
1421 const struct fib_result *res, 2129 const struct fib_result *res,
1422 struct in_device *in_dev, 2130 struct in_device *in_dev,
1423 __be32 daddr, __be32 saddr, u32 tos) 2131 __be32 daddr, __be32 saddr, u32 tos,
2132 struct rtable **result)
1424{ 2133{
1425 struct rtable *rth; 2134 struct rtable *rth;
1426 int err; 2135 int err;
1427 struct in_device *out_dev; 2136 struct in_device *out_dev;
1428 unsigned int flags = 0; 2137 unsigned int flags = 0;
1429 bool do_cache; 2138 __be32 spec_dst;
1430 u32 itag; 2139 u32 itag;
1431 2140
1432 /* get a working reference to the output device */ 2141 /* get a working reference to the output device */
1433 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); 2142 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1434 if (out_dev == NULL) { 2143 if (out_dev == NULL) {
1435 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); 2144 if (net_ratelimit())
2145 printk(KERN_CRIT "Bug in ip_route_input" \
2146 "_slow(). Please, report\n");
1436 return -EINVAL; 2147 return -EINVAL;
1437 } 2148 }
1438 2149
2150
1439 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 2151 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1440 in_dev->dev, in_dev, &itag); 2152 in_dev->dev, &spec_dst, &itag);
1441 if (err < 0) { 2153 if (err < 0) {
1442 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 2154 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1443 saddr); 2155 saddr);
@@ -1445,13 +2157,13 @@ static int __mkroute_input(struct sk_buff *skb,
1445 goto cleanup; 2157 goto cleanup;
1446 } 2158 }
1447 2159
1448 do_cache = res->fi && !itag; 2160 if (err)
1449 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && 2161 flags |= RTCF_DIRECTSRC;
2162
2163 if (out_dev == in_dev && err &&
1450 (IN_DEV_SHARED_MEDIA(out_dev) || 2164 (IN_DEV_SHARED_MEDIA(out_dev) ||
1451 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { 2165 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1452 flags |= RTCF_DOREDIRECT; 2166 flags |= RTCF_DOREDIRECT;
1453 do_cache = false;
1454 }
1455 2167
1456 if (skb->protocol != htons(ETH_P_IP)) { 2168 if (skb->protocol != htons(ETH_P_IP)) {
1457 /* Not IP (i.e. ARP). Do not create route, if it is 2169 /* Not IP (i.e. ARP). Do not create route, if it is
@@ -1468,38 +2180,38 @@ static int __mkroute_input(struct sk_buff *skb,
1468 } 2180 }
1469 } 2181 }
1470 2182
1471 if (do_cache) {
1472 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1473 if (rt_cache_valid(rth)) {
1474 skb_dst_set_noref(skb, &rth->dst);
1475 goto out;
1476 }
1477 }
1478
1479 rth = rt_dst_alloc(out_dev->dev, 2183 rth = rt_dst_alloc(out_dev->dev,
1480 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2184 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); 2185 IN_DEV_CONF_GET(out_dev, NOXFRM));
1482 if (!rth) { 2186 if (!rth) {
1483 err = -ENOBUFS; 2187 err = -ENOBUFS;
1484 goto cleanup; 2188 goto cleanup;
1485 } 2189 }
1486 2190
2191 rth->rt_key_dst = daddr;
2192 rth->rt_key_src = saddr;
1487 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2193 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488 rth->rt_flags = flags; 2194 rth->rt_flags = flags;
1489 rth->rt_type = res->type; 2195 rth->rt_type = res->type;
1490 rth->rt_is_input = 1; 2196 rth->rt_key_tos = tos;
1491 rth->rt_iif = 0; 2197 rth->rt_dst = daddr;
1492 rth->rt_pmtu = 0; 2198 rth->rt_src = saddr;
1493 rth->rt_gateway = 0; 2199 rth->rt_route_iif = in_dev->dev->ifindex;
1494 rth->rt_uses_gateway = 0; 2200 rth->rt_iif = in_dev->dev->ifindex;
1495 INIT_LIST_HEAD(&rth->rt_uncached); 2201 rth->rt_oif = 0;
2202 rth->rt_mark = skb->mark;
2203 rth->rt_gateway = daddr;
2204 rth->rt_spec_dst= spec_dst;
2205 rth->rt_peer_genid = 0;
2206 rth->peer = NULL;
2207 rth->fi = NULL;
1496 2208
1497 rth->dst.input = ip_forward; 2209 rth->dst.input = ip_forward;
1498 rth->dst.output = ip_output; 2210 rth->dst.output = ip_output;
1499 2211
1500 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); 2212 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1501 skb_dst_set(skb, &rth->dst); 2213
1502out: 2214 *result = rth;
1503 err = 0; 2215 err = 0;
1504 cleanup: 2216 cleanup:
1505 return err; 2217 return err;
@@ -1511,13 +2223,27 @@ static int ip_mkroute_input(struct sk_buff *skb,
1511 struct in_device *in_dev, 2223 struct in_device *in_dev,
1512 __be32 daddr, __be32 saddr, u32 tos) 2224 __be32 daddr, __be32 saddr, u32 tos)
1513{ 2225{
2226 struct rtable* rth = NULL;
2227 int err;
2228 unsigned hash;
2229
1514#ifdef CONFIG_IP_ROUTE_MULTIPATH 2230#ifdef CONFIG_IP_ROUTE_MULTIPATH
1515 if (res->fi && res->fi->fib_nhs > 1) 2231 if (res->fi && res->fi->fib_nhs > 1)
1516 fib_select_multipath(res); 2232 fib_select_multipath(res);
1517#endif 2233#endif
1518 2234
1519 /* create a routing cache entry */ 2235 /* create a routing cache entry */
1520 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2236 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2237 if (err)
2238 return err;
2239
2240 /* put it into the cache */
2241 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2242 rt_genid(dev_net(rth->dst.dev)));
2243 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2244 if (IS_ERR(rth))
2245 return PTR_ERR(rth);
2246 return 0;
1521} 2247}
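When CONFIG_IP_ROUTE_MULTIPATH is enabled and the matched fib_info carries more than one next hop, ip_mkroute_input() asks fib_select_multipath() to pick one before the route is built. Purely to illustrate what that selection step decides, here is a weight-proportional random pick in userspace C; it is not a claim about fib_select_multipath()'s actual policy, and all names are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pick one of several next hops in proportion to its weight. */
struct nexthop {
	const char *gw;
	int weight;
};

static int select_nexthop(const struct nexthop *nh, int n)
{
	int total = 0, r, i;

	for (i = 0; i < n; i++)
		total += nh[i].weight;
	r = rand() % total;
	for (i = 0; i < n; i++) {
		if (r < nh[i].weight)
			return i;
		r -= nh[i].weight;
	}
	return n - 1;
}

int main(void)
{
	struct nexthop nh[] = { { "192.0.2.1", 1 }, { "192.0.2.2", 3 } };

	srand((unsigned)time(NULL));
	printf("chose %s\n", nh[select_nexthop(nh, 2)].gw);
	return 0;
}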
1522 2248
1523/* 2249/*
@@ -1537,12 +2263,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1537 struct fib_result res; 2263 struct fib_result res;
1538 struct in_device *in_dev = __in_dev_get_rcu(dev); 2264 struct in_device *in_dev = __in_dev_get_rcu(dev);
1539 struct flowi4 fl4; 2265 struct flowi4 fl4;
1540 unsigned int flags = 0; 2266 unsigned flags = 0;
1541 u32 itag = 0; 2267 u32 itag = 0;
1542 struct rtable *rth; 2268 struct rtable * rth;
2269 unsigned hash;
2270 __be32 spec_dst;
1543 int err = -EINVAL; 2271 int err = -EINVAL;
1544 struct net *net = dev_net(dev); 2272 struct net * net = dev_net(dev);
1545 bool do_cache;
1546 2273
1547 /* IP on this device is disabled. */ 2274 /* IP on this device is disabled. */
1548 2275
@@ -1553,10 +2280,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1553 by fib_lookup. 2280 by fib_lookup.
1554 */ 2281 */
1555 2282
1556 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2283 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2284 ipv4_is_loopback(saddr))
1557 goto martian_source; 2285 goto martian_source;
1558 2286
1559 res.fi = NULL;
1560 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2287 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1561 goto brd_input; 2288 goto brd_input;
1562 2289
@@ -1566,20 +2293,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1566 if (ipv4_is_zeronet(saddr)) 2293 if (ipv4_is_zeronet(saddr))
1567 goto martian_source; 2294 goto martian_source;
1568 2295
1569 if (ipv4_is_zeronet(daddr)) 2296 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1570 goto martian_destination; 2297 goto martian_destination;
1571 2298
1572 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1573 * and call it once if daddr or/and saddr are loopback addresses
1574 */
1575 if (ipv4_is_loopback(daddr)) {
1576 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1577 goto martian_destination;
1578 } else if (ipv4_is_loopback(saddr)) {
1579 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1580 goto martian_source;
1581 }
1582
1583 /* 2299 /*
1584 * Now we are ready to route packet. 2300 * Now we are ready to route packet.
1585 */ 2301 */
@@ -1591,8 +2307,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1591 fl4.daddr = daddr; 2307 fl4.daddr = daddr;
1592 fl4.saddr = saddr; 2308 fl4.saddr = saddr;
1593 err = fib_lookup(net, &fl4, &res); 2309 err = fib_lookup(net, &fl4, &res);
1594 if (err != 0) 2310 if (err != 0) {
2311 if (!IN_DEV_FORWARD(in_dev))
2312 goto e_hostunreach;
1595 goto no_route; 2313 goto no_route;
2314 }
1596 2315
1597 RT_CACHE_STAT_INC(in_slow_tot); 2316 RT_CACHE_STAT_INC(in_slow_tot);
1598 2317
@@ -1601,15 +2320,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1601 2320
1602 if (res.type == RTN_LOCAL) { 2321 if (res.type == RTN_LOCAL) {
1603 err = fib_validate_source(skb, saddr, daddr, tos, 2322 err = fib_validate_source(skb, saddr, daddr, tos,
1604 LOOPBACK_IFINDEX, 2323 net->loopback_dev->ifindex,
1605 dev, in_dev, &itag); 2324 dev, &spec_dst, &itag);
1606 if (err < 0) 2325 if (err < 0)
1607 goto martian_source_keep_err; 2326 goto martian_source_keep_err;
2327 if (err)
2328 flags |= RTCF_DIRECTSRC;
2329 spec_dst = daddr;
1608 goto local_input; 2330 goto local_input;
1609 } 2331 }
1610 2332
1611 if (!IN_DEV_FORWARD(in_dev)) 2333 if (!IN_DEV_FORWARD(in_dev))
1612 goto no_route; 2334 goto e_hostunreach;
1613 if (res.type != RTN_UNICAST) 2335 if (res.type != RTN_UNICAST)
1614 goto martian_destination; 2336 goto martian_destination;
1615 2337
@@ -1620,32 +2342,23 @@ brd_input:
1620 if (skb->protocol != htons(ETH_P_IP)) 2342 if (skb->protocol != htons(ETH_P_IP))
1621 goto e_inval; 2343 goto e_inval;
1622 2344
1623 if (!ipv4_is_zeronet(saddr)) { 2345 if (ipv4_is_zeronet(saddr))
1624 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2346 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1625 in_dev, &itag); 2347 else {
2348 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2349 &itag);
1626 if (err < 0) 2350 if (err < 0)
1627 goto martian_source_keep_err; 2351 goto martian_source_keep_err;
2352 if (err)
2353 flags |= RTCF_DIRECTSRC;
1628 } 2354 }
1629 flags |= RTCF_BROADCAST; 2355 flags |= RTCF_BROADCAST;
1630 res.type = RTN_BROADCAST; 2356 res.type = RTN_BROADCAST;
1631 RT_CACHE_STAT_INC(in_brd); 2357 RT_CACHE_STAT_INC(in_brd);
1632 2358
1633local_input: 2359local_input:
1634 do_cache = false;
1635 if (res.fi) {
1636 if (!itag) {
1637 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1638 if (rt_cache_valid(rth)) {
1639 skb_dst_set_noref(skb, &rth->dst);
1640 err = 0;
1641 goto out;
1642 }
1643 do_cache = true;
1644 }
1645 }
1646
1647 rth = rt_dst_alloc(net->loopback_dev, 2360 rth = rt_dst_alloc(net->loopback_dev,
1648 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 2361 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1649 if (!rth) 2362 if (!rth)
1650 goto e_nobufs; 2363 goto e_nobufs;
1651 2364
@@ -1655,28 +2368,41 @@ local_input:
1655 rth->dst.tclassid = itag; 2368 rth->dst.tclassid = itag;
1656#endif 2369#endif
1657 2370
2371 rth->rt_key_dst = daddr;
2372 rth->rt_key_src = saddr;
1658 rth->rt_genid = rt_genid(net); 2373 rth->rt_genid = rt_genid(net);
1659 rth->rt_flags = flags|RTCF_LOCAL; 2374 rth->rt_flags = flags|RTCF_LOCAL;
1660 rth->rt_type = res.type; 2375 rth->rt_type = res.type;
1661 rth->rt_is_input = 1; 2376 rth->rt_key_tos = tos;
1662 rth->rt_iif = 0; 2377 rth->rt_dst = daddr;
1663 rth->rt_pmtu = 0; 2378 rth->rt_src = saddr;
1664 rth->rt_gateway = 0; 2379#ifdef CONFIG_IP_ROUTE_CLASSID
1665 rth->rt_uses_gateway = 0; 2380 rth->dst.tclassid = itag;
1666 INIT_LIST_HEAD(&rth->rt_uncached); 2381#endif
2382 rth->rt_route_iif = dev->ifindex;
2383 rth->rt_iif = dev->ifindex;
2384 rth->rt_oif = 0;
2385 rth->rt_mark = skb->mark;
2386 rth->rt_gateway = daddr;
2387 rth->rt_spec_dst= spec_dst;
2388 rth->rt_peer_genid = 0;
2389 rth->peer = NULL;
2390 rth->fi = NULL;
1667 if (res.type == RTN_UNREACHABLE) { 2391 if (res.type == RTN_UNREACHABLE) {
1668 rth->dst.input= ip_error; 2392 rth->dst.input= ip_error;
1669 rth->dst.error= -err; 2393 rth->dst.error= -err;
1670 rth->rt_flags &= ~RTCF_LOCAL; 2394 rth->rt_flags &= ~RTCF_LOCAL;
1671 } 2395 }
1672 if (do_cache) 2396 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
1673 rt_cache_route(&FIB_RES_NH(res), rth); 2397 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
1674 skb_dst_set(skb, &rth->dst);
1675 err = 0; 2398 err = 0;
2399 if (IS_ERR(rth))
2400 err = PTR_ERR(rth);
1676 goto out; 2401 goto out;
1677 2402
1678no_route: 2403no_route:
1679 RT_CACHE_STAT_INC(in_no_route); 2404 RT_CACHE_STAT_INC(in_no_route);
2405 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1680 res.type = RTN_UNREACHABLE; 2406 res.type = RTN_UNREACHABLE;
1681 if (err == -ESRCH) 2407 if (err == -ESRCH)
1682 err = -ENETUNREACH; 2408 err = -ENETUNREACH;
@@ -1688,11 +2414,15 @@ no_route:
1688martian_destination: 2414martian_destination:
1689 RT_CACHE_STAT_INC(in_martian_dst); 2415 RT_CACHE_STAT_INC(in_martian_dst);
1690#ifdef CONFIG_IP_ROUTE_VERBOSE 2416#ifdef CONFIG_IP_ROUTE_VERBOSE
1691 if (IN_DEV_LOG_MARTIANS(in_dev)) 2417 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1692 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2418 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
1693 &daddr, &saddr, dev->name); 2419 &daddr, &saddr, dev->name);
1694#endif 2420#endif
1695 2421
2422e_hostunreach:
2423 err = -EHOSTUNREACH;
2424 goto out;
2425
1696e_inval: 2426e_inval:
1697 err = -EINVAL; 2427 err = -EINVAL;
1698 goto out; 2428 goto out;
@@ -1708,13 +2438,50 @@ martian_source_keep_err:
1708 goto out; 2438 goto out;
1709} 2439}
1710 2440
1711int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2441int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712 u8 tos, struct net_device *dev) 2442 u8 tos, struct net_device *dev, bool noref)
1713{ 2443{
2444 struct rtable * rth;
2445 unsigned hash;
2446 int iif = dev->ifindex;
2447 struct net *net;
1714 int res; 2448 int res;
1715 2449
2450 net = dev_net(dev);
2451
1716 rcu_read_lock(); 2452 rcu_read_lock();
1717 2453
2454 if (!rt_caching(net))
2455 goto skip_cache;
2456
2457 tos &= IPTOS_RT_MASK;
2458 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2459
2460 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2461 rth = rcu_dereference(rth->dst.rt_next)) {
2462 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2463 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2464 (rth->rt_route_iif ^ iif) |
2465 (rth->rt_key_tos ^ tos)) == 0 &&
2466 rth->rt_mark == skb->mark &&
2467 net_eq(dev_net(rth->dst.dev), net) &&
2468 !rt_is_expired(rth)) {
2469 ipv4_validate_peer(rth);
2470 if (noref) {
2471 dst_use_noref(&rth->dst, jiffies);
2472 skb_dst_set_noref(skb, &rth->dst);
2473 } else {
2474 dst_use(&rth->dst, jiffies);
2475 skb_dst_set(skb, &rth->dst);
2476 }
2477 RT_CACHE_STAT_INC(in_hit);
2478 rcu_read_unlock();
2479 return 0;
2480 }
2481 RT_CACHE_STAT_INC(in_hlist_search);
2482 }
2483
2484skip_cache:
1718 /* Multicast recognition logic is moved from route cache to here. 2485 /* Multicast recognition logic is moved from route cache to here.
1719 The problem was that too many Ethernet cards have broken/missing 2486 The problem was that too many Ethernet cards have broken/missing
1720 hardware multicast filters :-( As result the host on multicasting 2487 hardware multicast filters :-( As result the host on multicasting
@@ -1752,29 +2519,24 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1752 rcu_read_unlock(); 2519 rcu_read_unlock();
1753 return res; 2520 return res;
1754} 2521}
1755EXPORT_SYMBOL(ip_route_input_noref); 2522EXPORT_SYMBOL(ip_route_input_common);
1756 2523
1757/* called with rcu_read_lock() */ 2524/* called with rcu_read_lock() */
1758static struct rtable *__mkroute_output(const struct fib_result *res, 2525static struct rtable *__mkroute_output(const struct fib_result *res,
1759 const struct flowi4 *fl4, int orig_oif, 2526 const struct flowi4 *fl4,
2527 __be32 orig_daddr, __be32 orig_saddr,
2528 int orig_oif, __u8 orig_rtos,
1760 struct net_device *dev_out, 2529 struct net_device *dev_out,
1761 unsigned int flags) 2530 unsigned int flags)
1762{ 2531{
1763 struct fib_info *fi = res->fi; 2532 struct fib_info *fi = res->fi;
1764 struct fib_nh_exception *fnhe;
1765 struct in_device *in_dev; 2533 struct in_device *in_dev;
1766 u16 type = res->type; 2534 u16 type = res->type;
1767 struct rtable *rth; 2535 struct rtable *rth;
1768 bool do_cache;
1769 2536
1770 in_dev = __in_dev_get_rcu(dev_out); 2537 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1771 if (!in_dev)
1772 return ERR_PTR(-EINVAL); 2538 return ERR_PTR(-EINVAL);
1773 2539
1774 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1775 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1776 return ERR_PTR(-EINVAL);
1777
1778 if (ipv4_is_lbcast(fl4->daddr)) 2540 if (ipv4_is_lbcast(fl4->daddr))
1779 type = RTN_BROADCAST; 2541 type = RTN_BROADCAST;
1780 else if (ipv4_is_multicast(fl4->daddr)) 2542 else if (ipv4_is_multicast(fl4->daddr))
@@ -1785,7 +2547,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1785 if (dev_out->flags & IFF_LOOPBACK) 2547 if (dev_out->flags & IFF_LOOPBACK)
1786 flags |= RTCF_LOCAL; 2548 flags |= RTCF_LOCAL;
1787 2549
1788 do_cache = true; 2550 in_dev = __in_dev_get_rcu(dev_out);
2551 if (!in_dev)
2552 return ERR_PTR(-EINVAL);
2553
1789 if (type == RTN_BROADCAST) { 2554 if (type == RTN_BROADCAST) {
1790 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2555 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1791 fi = NULL; 2556 fi = NULL;
@@ -1794,8 +2559,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1794 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2559 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1795 fl4->flowi4_proto)) 2560 fl4->flowi4_proto))
1796 flags &= ~RTCF_LOCAL; 2561 flags &= ~RTCF_LOCAL;
1797 else
1798 do_cache = false;
1799 /* If multicast route do not exist use 2562 /* If multicast route do not exist use
1800 * default one, but do not gateway in this case. 2563 * default one, but do not gateway in this case.
1801 * Yes, it is hack. 2564 * Yes, it is hack.
@@ -1804,57 +2567,40 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1804 fi = NULL; 2567 fi = NULL;
1805 } 2568 }
1806 2569
1807 fnhe = NULL;
1808 do_cache &= fi != NULL;
1809 if (do_cache) {
1810 struct rtable __rcu **prth;
1811 struct fib_nh *nh = &FIB_RES_NH(*res);
1812
1813 fnhe = find_exception(nh, fl4->daddr);
1814 if (fnhe)
1815 prth = &fnhe->fnhe_rth;
1816 else {
1817 if (unlikely(fl4->flowi4_flags &
1818 FLOWI_FLAG_KNOWN_NH &&
1819 !(nh->nh_gw &&
1820 nh->nh_scope == RT_SCOPE_LINK))) {
1821 do_cache = false;
1822 goto add;
1823 }
1824 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1825 }
1826 rth = rcu_dereference(*prth);
1827 if (rt_cache_valid(rth)) {
1828 dst_hold(&rth->dst);
1829 return rth;
1830 }
1831 }
1832
1833add:
1834 rth = rt_dst_alloc(dev_out, 2570 rth = rt_dst_alloc(dev_out,
1835 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2571 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1836 IN_DEV_CONF_GET(in_dev, NOXFRM), 2572 IN_DEV_CONF_GET(in_dev, NOXFRM));
1837 do_cache);
1838 if (!rth) 2573 if (!rth)
1839 return ERR_PTR(-ENOBUFS); 2574 return ERR_PTR(-ENOBUFS);
1840 2575
1841 rth->dst.output = ip_output; 2576 rth->dst.output = ip_output;
1842 2577
2578 rth->rt_key_dst = orig_daddr;
2579 rth->rt_key_src = orig_saddr;
1843 rth->rt_genid = rt_genid(dev_net(dev_out)); 2580 rth->rt_genid = rt_genid(dev_net(dev_out));
1844 rth->rt_flags = flags; 2581 rth->rt_flags = flags;
1845 rth->rt_type = type; 2582 rth->rt_type = type;
1846 rth->rt_is_input = 0; 2583 rth->rt_key_tos = orig_rtos;
1847 rth->rt_iif = orig_oif ? : 0; 2584 rth->rt_dst = fl4->daddr;
1848 rth->rt_pmtu = 0; 2585 rth->rt_src = fl4->saddr;
1849 rth->rt_gateway = 0; 2586 rth->rt_route_iif = 0;
1850 rth->rt_uses_gateway = 0; 2587 rth->rt_iif = orig_oif ? : dev_out->ifindex;
1851 INIT_LIST_HEAD(&rth->rt_uncached); 2588 rth->rt_oif = orig_oif;
2589 rth->rt_mark = fl4->flowi4_mark;
2590 rth->rt_gateway = fl4->daddr;
2591 rth->rt_spec_dst= fl4->saddr;
2592 rth->rt_peer_genid = 0;
2593 rth->peer = NULL;
2594 rth->fi = NULL;
1852 2595
1853 RT_CACHE_STAT_INC(out_slow_tot); 2596 RT_CACHE_STAT_INC(out_slow_tot);
1854 2597
1855 if (flags & RTCF_LOCAL) 2598 if (flags & RTCF_LOCAL) {
1856 rth->dst.input = ip_local_deliver; 2599 rth->dst.input = ip_local_deliver;
2600 rth->rt_spec_dst = fl4->daddr;
2601 }
1857 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2602 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2603 rth->rt_spec_dst = fl4->saddr;
1858 if (flags & RTCF_LOCAL && 2604 if (flags & RTCF_LOCAL &&
1859 !(dev_out->flags & IFF_LOOPBACK)) { 2605 !(dev_out->flags & IFF_LOOPBACK)) {
1860 rth->dst.output = ip_mc_output; 2606 rth->dst.output = ip_mc_output;
@@ -1871,31 +2617,37 @@ add:
1871#endif 2617#endif
1872 } 2618 }
1873 2619
1874 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); 2620 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1875 2621
1876 return rth; 2622 return rth;
1877} 2623}
1878 2624
1879/* 2625/*
1880 * Major route resolver routine. 2626 * Major route resolver routine.
2627 * called with rcu_read_lock();
1881 */ 2628 */
1882 2629
1883struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) 2630static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1884{ 2631{
1885 struct net_device *dev_out = NULL; 2632 struct net_device *dev_out = NULL;
1886 __u8 tos = RT_FL_TOS(fl4); 2633 __u8 tos = RT_FL_TOS(fl4);
1887 unsigned int flags = 0; 2634 unsigned int flags = 0;
1888 struct fib_result res; 2635 struct fib_result res;
1889 struct rtable *rth; 2636 struct rtable *rth;
2637 __be32 orig_daddr;
2638 __be32 orig_saddr;
1890 int orig_oif; 2639 int orig_oif;
1891 2640
1892 res.tclassid = 0;
1893 res.fi = NULL; 2641 res.fi = NULL;
1894 res.table = NULL; 2642#ifdef CONFIG_IP_MULTIPLE_TABLES
2643 res.r = NULL;
2644#endif
1895 2645
2646 orig_daddr = fl4->daddr;
2647 orig_saddr = fl4->saddr;
1896 orig_oif = fl4->flowi4_oif; 2648 orig_oif = fl4->flowi4_oif;
1897 2649
1898 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2650 fl4->flowi4_iif = net->loopback_dev->ifindex;
1899 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2651 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2652 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2653 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1984,7 +2736,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1984 if (!fl4->daddr) 2736 if (!fl4->daddr)
1985 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2737 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986 dev_out = net->loopback_dev; 2738 dev_out = net->loopback_dev;
1987 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2739 fl4->flowi4_oif = net->loopback_dev->ifindex;
1988 res.type = RTN_LOCAL; 2740 res.type = RTN_LOCAL;
1989 flags |= RTCF_LOCAL; 2741 flags |= RTCF_LOCAL;
1990 goto make_route; 2742 goto make_route;
@@ -1992,7 +2744,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1992 2744
1993 if (fib_lookup(net, fl4, &res)) { 2745 if (fib_lookup(net, fl4, &res)) {
1994 res.fi = NULL; 2746 res.fi = NULL;
1995 res.table = NULL;
1996 if (fl4->flowi4_oif) { 2747 if (fl4->flowi4_oif) {
1997 /* Apparently, routing tables are wrong. Assume, 2748 /* Apparently, routing tables are wrong. Assume,
1998 that the destination is on link. 2749 that the destination is on link.
@@ -2031,6 +2782,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2031 } 2782 }
2032 dev_out = net->loopback_dev; 2783 dev_out = net->loopback_dev;
2033 fl4->flowi4_oif = dev_out->ifindex; 2784 fl4->flowi4_oif = dev_out->ifindex;
2785 res.fi = NULL;
2034 flags |= RTCF_LOCAL; 2786 flags |= RTCF_LOCAL;
2035 goto make_route; 2787 goto make_route;
2036 } 2788 }
@@ -2053,33 +2805,73 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2053 2805
2054 2806
2055make_route: 2807make_route:
2056 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); 2808 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2809 tos, dev_out, flags);
2810 if (!IS_ERR(rth)) {
2811 unsigned int hash;
2812
2813 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2814 rt_genid(dev_net(dev_out)));
2815 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2816 }
2057 2817
2058out: 2818out:
2059 rcu_read_unlock(); 2819 rcu_read_unlock();
2060 return rth; 2820 return rth;
2061} 2821}
2062EXPORT_SYMBOL_GPL(__ip_route_output_key);
2063 2822
2064static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2823struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2065{ 2824{
2066 return NULL; 2825 struct rtable *rth;
2826 unsigned int hash;
2827
2828 if (!rt_caching(net))
2829 goto slow_output;
2830
2831 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2832
2833 rcu_read_lock_bh();
2834 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2835 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2836 if (rth->rt_key_dst == flp4->daddr &&
2837 rth->rt_key_src == flp4->saddr &&
2838 rt_is_output_route(rth) &&
2839 rth->rt_oif == flp4->flowi4_oif &&
2840 rth->rt_mark == flp4->flowi4_mark &&
2841 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2842 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2843 net_eq(dev_net(rth->dst.dev), net) &&
2844 !rt_is_expired(rth)) {
2845 ipv4_validate_peer(rth);
2846 dst_use(&rth->dst, jiffies);
2847 RT_CACHE_STAT_INC(out_hit);
2848 rcu_read_unlock_bh();
2849 if (!flp4->saddr)
2850 flp4->saddr = rth->rt_src;
2851 if (!flp4->daddr)
2852 flp4->daddr = rth->rt_dst;
2853 return rth;
2854 }
2855 RT_CACHE_STAT_INC(out_hlist_search);
2856 }
2857 rcu_read_unlock_bh();
2858
2859slow_output:
2860 return ip_route_output_slow(net, flp4);
2067} 2861}
2862EXPORT_SYMBOL_GPL(__ip_route_output_key);
2068 2863
2069static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2864static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2070{ 2865{
2071 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2866 return NULL;
2072
2073 return mtu ? : dst->dev->mtu;
2074} 2867}
2075 2868
2076static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2869static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2077 struct sk_buff *skb, u32 mtu)
2078{ 2870{
2871 return 0;
2079} 2872}
2080 2873
2081static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2874static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2082 struct sk_buff *skb)
2083{ 2875{
2084} 2876}
2085 2877
@@ -2092,43 +2884,53 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2092static struct dst_ops ipv4_dst_blackhole_ops = { 2884static struct dst_ops ipv4_dst_blackhole_ops = {
2093 .family = AF_INET, 2885 .family = AF_INET,
2094 .protocol = cpu_to_be16(ETH_P_IP), 2886 .protocol = cpu_to_be16(ETH_P_IP),
2887 .destroy = ipv4_dst_destroy,
2095 .check = ipv4_blackhole_dst_check, 2888 .check = ipv4_blackhole_dst_check,
2096 .mtu = ipv4_blackhole_mtu, 2889 .default_mtu = ipv4_blackhole_default_mtu,
2097 .default_advmss = ipv4_default_advmss, 2890 .default_advmss = ipv4_default_advmss,
2098 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2891 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2099 .redirect = ipv4_rt_blackhole_redirect,
2100 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2892 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2101 .neigh_lookup = ipv4_neigh_lookup, 2893 .neigh_lookup = ipv4_neigh_lookup,
2102}; 2894};
2103 2895
2104struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2896struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2105{ 2897{
2898 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2106 struct rtable *ort = (struct rtable *) dst_orig; 2899 struct rtable *ort = (struct rtable *) dst_orig;
2107 struct rtable *rt;
2108 2900
2109 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2110 if (rt) { 2901 if (rt) {
2111 struct dst_entry *new = &rt->dst; 2902 struct dst_entry *new = &rt->dst;
2112 2903
2113 new->__use = 1; 2904 new->__use = 1;
2114 new->input = dst_discard; 2905 new->input = dst_discard;
2115 new->output = dst_discard; 2906 new->output = dst_discard;
2907 dst_copy_metrics(new, &ort->dst);
2116 2908
2117 new->dev = ort->dst.dev; 2909 new->dev = ort->dst.dev;
2118 if (new->dev) 2910 if (new->dev)
2119 dev_hold(new->dev); 2911 dev_hold(new->dev);
2120 2912
2121 rt->rt_is_input = ort->rt_is_input; 2913 rt->rt_key_dst = ort->rt_key_dst;
2914 rt->rt_key_src = ort->rt_key_src;
2915 rt->rt_key_tos = ort->rt_key_tos;
2916 rt->rt_route_iif = ort->rt_route_iif;
2122 rt->rt_iif = ort->rt_iif; 2917 rt->rt_iif = ort->rt_iif;
2123 rt->rt_pmtu = ort->rt_pmtu; 2918 rt->rt_oif = ort->rt_oif;
2919 rt->rt_mark = ort->rt_mark;
2124 2920
2125 rt->rt_genid = rt_genid(net); 2921 rt->rt_genid = rt_genid(net);
2126 rt->rt_flags = ort->rt_flags; 2922 rt->rt_flags = ort->rt_flags;
2127 rt->rt_type = ort->rt_type; 2923 rt->rt_type = ort->rt_type;
2924 rt->rt_dst = ort->rt_dst;
2925 rt->rt_src = ort->rt_src;
2128 rt->rt_gateway = ort->rt_gateway; 2926 rt->rt_gateway = ort->rt_gateway;
2129 rt->rt_uses_gateway = ort->rt_uses_gateway; 2927 rt->rt_spec_dst = ort->rt_spec_dst;
2130 2928 rt->peer = ort->peer;
2131 INIT_LIST_HEAD(&rt->rt_uncached); 2929 if (rt->peer)
2930 atomic_inc(&rt->peer->refcnt);
2931 rt->fi = ort->fi;
2932 if (rt->fi)
2933 atomic_inc(&rt->fi->fib_clntref);
2132 2934
2133 dst_free(new); 2935 dst_free(new);
2134 } 2936 }
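ipv4_blackhole_route() clones an existing route onto ipv4_dst_blackhole_ops, a dst whose operations deliberately do nothing, so a caller can keep holding a route object that quietly absorbs PMTU updates and reports no usable MTU while the real route is being invalidated. A tiny userspace sketch of that "ops table of stubs" idea, with made-up names:

#include <stdio.h>

/* Sketch of a blackhole dst: every operation is a harmless stub. */
struct blackhole_ops {
	unsigned int (*mtu)(void);
	void (*update_pmtu)(unsigned int mtu);
};

static unsigned int bh_mtu(void)             { return 0; }   /* nothing to report */
static void bh_update_pmtu(unsigned int mtu) { (void)mtu; }  /* silently ignored */

static const struct blackhole_ops blackhole = {
	.mtu         = bh_mtu,
	.update_pmtu = bh_update_pmtu,
};

int main(void)
{
	blackhole.update_pmtu(1400);            /* dropped on the floor */
	printf("mtu = %u\n", blackhole.mtu());  /* prints 0 */
	return 0;
}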
@@ -2155,18 +2957,18 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2155} 2957}
2156EXPORT_SYMBOL_GPL(ip_route_output_flow); 2958EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157 2959
2158static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2960static int rt_fill_info(struct net *net,
2159 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2961 struct sk_buff *skb, u32 pid, u32 seq, int event,
2160 u32 seq, int event, int nowait, unsigned int flags) 2962 int nowait, unsigned int flags)
2161{ 2963{
2162 struct rtable *rt = skb_rtable(skb); 2964 struct rtable *rt = skb_rtable(skb);
2163 struct rtmsg *r; 2965 struct rtmsg *r;
2164 struct nlmsghdr *nlh; 2966 struct nlmsghdr *nlh;
2165 unsigned long expires = 0; 2967 long expires = 0;
2166 u32 error; 2968 const struct inet_peer *peer = rt->peer;
2167 u32 metrics[RTAX_MAX]; 2969 u32 id = 0, ts = 0, tsage = 0, error;
2168 2970
2169 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); 2971 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2170 if (nlh == NULL) 2972 if (nlh == NULL)
2171 return -EMSGSIZE; 2973 return -EMSGSIZE;
2172 2974
@@ -2174,10 +2976,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2174 r->rtm_family = AF_INET; 2976 r->rtm_family = AF_INET;
2175 r->rtm_dst_len = 32; 2977 r->rtm_dst_len = 32;
2176 r->rtm_src_len = 0; 2978 r->rtm_src_len = 0;
2177 r->rtm_tos = fl4->flowi4_tos; 2979 r->rtm_tos = rt->rt_key_tos;
2178 r->rtm_table = RT_TABLE_MAIN; 2980 r->rtm_table = RT_TABLE_MAIN;
2179 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) 2981 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2180 goto nla_put_failure;
2181 r->rtm_type = rt->rt_type; 2982 r->rtm_type = rt->rt_type;
2182 r->rtm_scope = RT_SCOPE_UNIVERSE; 2983 r->rtm_scope = RT_SCOPE_UNIVERSE;
2183 r->rtm_protocol = RTPROT_UNSPEC; 2984 r->rtm_protocol = RTPROT_UNSPEC;
@@ -2185,58 +2986,53 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2185 if (rt->rt_flags & RTCF_NOTIFY) 2986 if (rt->rt_flags & RTCF_NOTIFY)
2186 r->rtm_flags |= RTM_F_NOTIFY; 2987 r->rtm_flags |= RTM_F_NOTIFY;
2187 2988
2188 if (nla_put_be32(skb, RTA_DST, dst)) 2989 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2189 goto nla_put_failure; 2990
2190 if (src) { 2991 if (rt->rt_key_src) {
2191 r->rtm_src_len = 32; 2992 r->rtm_src_len = 32;
2192 if (nla_put_be32(skb, RTA_SRC, src)) 2993 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2193 goto nla_put_failure;
2194 } 2994 }
2195 if (rt->dst.dev && 2995 if (rt->dst.dev)
2196 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2996 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2197 goto nla_put_failure;
2198#ifdef CONFIG_IP_ROUTE_CLASSID 2997#ifdef CONFIG_IP_ROUTE_CLASSID
2199 if (rt->dst.tclassid && 2998 if (rt->dst.tclassid)
2200 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2999 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2201 goto nla_put_failure;
2202#endif 3000#endif
2203 if (!rt_is_input_route(rt) && 3001 if (rt_is_input_route(rt))
2204 fl4->saddr != src) { 3002 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2205 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) 3003 else if (rt->rt_src != rt->rt_key_src)
2206 goto nla_put_failure; 3004 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2207 }
2208 if (rt->rt_uses_gateway &&
2209 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210 goto nla_put_failure;
2211
2212 expires = rt->dst.expires;
2213 if (expires) {
2214 unsigned long now = jiffies;
2215 3005
2216 if (time_before(now, expires)) 3006 if (rt->rt_dst != rt->rt_gateway)
2217 expires -= now; 3007 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2218 else
2219 expires = 0;
2220 }
2221 3008
2222 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 3009 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2223 if (rt->rt_pmtu && expires)
2224 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2225 if (rtnetlink_put_metrics(skb, metrics) < 0)
2226 goto nla_put_failure; 3010 goto nla_put_failure;
2227 3011
2228 if (fl4->flowi4_mark && 3012 if (rt->rt_mark)
2229 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 3013 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2230 goto nla_put_failure;
2231 3014
2232 error = rt->dst.error; 3015 error = rt->dst.error;
3016 if (peer) {
3017 inet_peer_refcheck(rt->peer);
3018 id = atomic_read(&peer->ip_id_count) & 0xffff;
3019 if (peer->tcp_ts_stamp) {
3020 ts = peer->tcp_ts;
3021 tsage = get_seconds() - peer->tcp_ts_stamp;
3022 }
3023 expires = ACCESS_ONCE(peer->pmtu_expires);
3024 if (expires)
3025 expires -= jiffies;
3026 }
2233 3027
2234 if (rt_is_input_route(rt)) { 3028 if (rt_is_input_route(rt)) {
2235#ifdef CONFIG_IP_MROUTE 3029#ifdef CONFIG_IP_MROUTE
3030 __be32 dst = rt->rt_dst;
3031
2236 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 3032 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2237 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 3033 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2238 int err = ipmr_get_route(net, skb, 3034 int err = ipmr_get_route(net, skb,
2239 fl4->saddr, fl4->daddr, 3035 rt->rt_src, rt->rt_dst,
2240 r, nowait); 3036 r, nowait);
2241 if (err <= 0) { 3037 if (err <= 0) {
2242 if (!nowait) { 3038 if (!nowait) {
@@ -2251,11 +3047,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2251 } 3047 }
2252 } else 3048 } else
2253#endif 3049#endif
2254 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) 3050 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2255 goto nla_put_failure;
2256 } 3051 }
2257 3052
2258 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 3053 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3054 expires, error) < 0)
2259 goto nla_put_failure; 3055 goto nla_put_failure;
2260 3056
2261 return nlmsg_end(skb, nlh); 3057 return nlmsg_end(skb, nlh);
@@ -2265,13 +3061,12 @@ nla_put_failure:
2265 return -EMSGSIZE; 3061 return -EMSGSIZE;
2266} 3062}
2267 3063
2268static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) 3064static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2269{ 3065{
2270 struct net *net = sock_net(in_skb->sk); 3066 struct net *net = sock_net(in_skb->sk);
2271 struct rtmsg *rtm; 3067 struct rtmsg *rtm;
2272 struct nlattr *tb[RTA_MAX+1]; 3068 struct nlattr *tb[RTA_MAX+1];
2273 struct rtable *rt = NULL; 3069 struct rtable *rt = NULL;
2274 struct flowi4 fl4;
2275 __be32 dst = 0; 3070 __be32 dst = 0;
2276 __be32 src = 0; 3071 __be32 src = 0;
2277 u32 iif; 3072 u32 iif;
@@ -2306,13 +3101,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2306 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 3101 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2307 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 3102 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2308 3103
2309 memset(&fl4, 0, sizeof(fl4));
2310 fl4.daddr = dst;
2311 fl4.saddr = src;
2312 fl4.flowi4_tos = rtm->rtm_tos;
2313 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2314 fl4.flowi4_mark = mark;
2315
2316 if (iif) { 3104 if (iif) {
2317 struct net_device *dev; 3105 struct net_device *dev;
2318 3106
@@ -2333,6 +3121,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2333 if (err == 0 && rt->dst.error) 3121 if (err == 0 && rt->dst.error)
2334 err = -rt->dst.error; 3122 err = -rt->dst.error;
2335 } else { 3123 } else {
3124 struct flowi4 fl4 = {
3125 .daddr = dst,
3126 .saddr = src,
3127 .flowi4_tos = rtm->rtm_tos,
3128 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3129 .flowi4_mark = mark,
3130 };
2336 rt = ip_route_output_key(net, &fl4); 3131 rt = ip_route_output_key(net, &fl4);
2337 3132
2338 err = 0; 3133 err = 0;
@@ -2347,13 +3142,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2347 if (rtm->rtm_flags & RTM_F_NOTIFY) 3142 if (rtm->rtm_flags & RTM_F_NOTIFY)
2348 rt->rt_flags |= RTCF_NOTIFY; 3143 rt->rt_flags |= RTCF_NOTIFY;
2349 3144
2350 err = rt_fill_info(net, dst, src, &fl4, skb, 3145 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2351 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2352 RTM_NEWROUTE, 0, 0); 3146 RTM_NEWROUTE, 0, 0);
2353 if (err <= 0) 3147 if (err <= 0)
2354 goto errout_free; 3148 goto errout_free;
2355 3149
2356 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3150 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2357errout: 3151errout:
2358 return err; 3152 return err;
2359 3153
@@ -2364,12 +3158,49 @@ errout_free:
2364 3158
2365int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 3159int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2366{ 3160{
3161 struct rtable *rt;
3162 int h, s_h;
3163 int idx, s_idx;
3164 struct net *net;
3165
3166 net = sock_net(skb->sk);
3167
3168 s_h = cb->args[0];
3169 if (s_h < 0)
3170 s_h = 0;
3171 s_idx = idx = cb->args[1];
3172 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3173 if (!rt_hash_table[h].chain)
3174 continue;
3175 rcu_read_lock_bh();
3176 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3177 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3178 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3179 continue;
3180 if (rt_is_expired(rt))
3181 continue;
3182 skb_dst_set_noref(skb, &rt->dst);
3183 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3184 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3185 1, NLM_F_MULTI) <= 0) {
3186 skb_dst_drop(skb);
3187 rcu_read_unlock_bh();
3188 goto done;
3189 }
3190 skb_dst_drop(skb);
3191 }
3192 rcu_read_unlock_bh();
3193 }
3194
3195done:
3196 cb->args[0] = h;
3197 cb->args[1] = idx;
2367 return skb->len; 3198 return skb->len;
2368} 3199}
2369 3200
2370void ip_rt_multicast_event(struct in_device *in_dev) 3201void ip_rt_multicast_event(struct in_device *in_dev)
2371{ 3202{
2372 rt_cache_flush(dev_net(in_dev->dev)); 3203 rt_cache_flush(dev_net(in_dev->dev), 0);
2373} 3204}
2374 3205
2375#ifdef CONFIG_SYSCTL 3206#ifdef CONFIG_SYSCTL
@@ -2378,7 +3209,16 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2378 size_t *lenp, loff_t *ppos) 3209 size_t *lenp, loff_t *ppos)
2379{ 3210{
2380 if (write) { 3211 if (write) {
2381 rt_cache_flush((struct net *)__ctl->extra1); 3212 int flush_delay;
3213 ctl_table ctl;
3214 struct net *net;
3215
3216 memcpy(&ctl, __ctl, sizeof(ctl));
3217 ctl.data = &flush_delay;
3218 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3219
3220 net = (struct net *)__ctl->extra1;
3221 rt_cache_flush(net, flush_delay);
2382 return 0; 3222 return 0;
2383 } 3223 }
2384 3224
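In the right-hand ipv4_sysctl_rtcache_flush(), the handler copies the ctl_table, points .data at a local flush_delay and lets proc_dointvec() parse the user's write before acting on the value. A toy userspace model of that "redirect .data at a local" pattern follows; every name below is invented for the demo, and parse_int() merely stands in for proc_dointvec():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_ctl {
	const char *procname;
	void *data;
	size_t maxlen;
};

/* Stand-in for proc_dointvec(): parse the write buffer into *(int *)data. */
static int parse_int(struct toy_ctl *ctl, const char *buffer)
{
	*(int *)ctl->data = atoi(buffer);
	return 0;
}

static int flush_handler(struct toy_ctl *ctl, const char *buffer)
{
	int flush_delay;
	struct toy_ctl tmp;

	/* Copy the table and point .data at a local, so the parsed value
	 * never touches the table's real backing storage. */
	memcpy(&tmp, ctl, sizeof(tmp));
	tmp.data = &flush_delay;
	parse_int(&tmp, buffer);

	printf("would flush route cache with delay %d\n", flush_delay);
	return 0;
}

int main(void)
{
	int dummy;
	struct toy_ctl flush = { .procname = "flush", .data = &dummy, .maxlen = sizeof(int) };

	return flush_handler(&flush, "3");
}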
@@ -2431,6 +3271,13 @@ static ctl_table ipv4_route_table[] = {
2431 .proc_handler = proc_dointvec_jiffies, 3271 .proc_handler = proc_dointvec_jiffies,
2432 }, 3272 },
2433 { 3273 {
3274 .procname = "gc_interval",
3275 .data = &ip_rt_gc_interval,
3276 .maxlen = sizeof(int),
3277 .mode = 0644,
3278 .proc_handler = proc_dointvec_jiffies,
3279 },
3280 {
2434 .procname = "redirect_load", 3281 .procname = "redirect_load",
2435 .data = &ip_rt_redirect_load, 3282 .data = &ip_rt_redirect_load,
2436 .maxlen = sizeof(int), 3283 .maxlen = sizeof(int),
@@ -2496,6 +3343,23 @@ static ctl_table ipv4_route_table[] = {
2496 { } 3343 { }
2497}; 3344};
2498 3345
3346static struct ctl_table empty[1];
3347
3348static struct ctl_table ipv4_skeleton[] =
3349{
3350 { .procname = "route",
3351 .mode = 0555, .child = ipv4_route_table},
3352 { .procname = "neigh",
3353 .mode = 0555, .child = empty},
3354 { }
3355};
3356
3357static __net_initdata struct ctl_path ipv4_path[] = {
3358 { .procname = "net", },
3359 { .procname = "ipv4", },
3360 { },
3361};
3362
2499static struct ctl_table ipv4_route_flush_table[] = { 3363static struct ctl_table ipv4_route_flush_table[] = {
2500 { 3364 {
2501 .procname = "flush", 3365 .procname = "flush",
@@ -2506,6 +3370,13 @@ static struct ctl_table ipv4_route_flush_table[] = {
2506 { }, 3370 { },
2507}; 3371};
2508 3372
3373static __net_initdata struct ctl_path ipv4_route_path[] = {
3374 { .procname = "net", },
3375 { .procname = "ipv4", },
3376 { .procname = "route", },
3377 { },
3378};
3379
2509static __net_init int sysctl_route_net_init(struct net *net) 3380static __net_init int sysctl_route_net_init(struct net *net)
2510{ 3381{
2511 struct ctl_table *tbl; 3382 struct ctl_table *tbl;
@@ -2515,14 +3386,11 @@ static __net_init int sysctl_route_net_init(struct net *net)
2515 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3386 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2516 if (tbl == NULL) 3387 if (tbl == NULL)
2517 goto err_dup; 3388 goto err_dup;
2518
2519 /* Don't export sysctls to unprivileged users */
2520 if (net->user_ns != &init_user_ns)
2521 tbl[0].procname = NULL;
2522 } 3389 }
2523 tbl[0].extra1 = net; 3390 tbl[0].extra1 = net;
2524 3391
2525 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3392 net->ipv4.route_hdr =
3393 register_net_sysctl_table(net, ipv4_route_path, tbl);
2526 if (net->ipv4.route_hdr == NULL) 3394 if (net->ipv4.route_hdr == NULL)
2527 goto err_reg; 3395 goto err_reg;
2528 return 0; 3396 return 0;
@@ -2552,7 +3420,8 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
2552 3420
2553static __net_init int rt_genid_init(struct net *net) 3421static __net_init int rt_genid_init(struct net *net)
2554{ 3422{
2555 atomic_set(&net->rt_genid, 0); 3423 get_random_bytes(&net->ipv4.rt_genid,
3424 sizeof(net->ipv4.rt_genid));
2556 get_random_bytes(&net->ipv4.dev_addr_genid, 3425 get_random_bytes(&net->ipv4.dev_addr_genid,
2557 sizeof(net->ipv4.dev_addr_genid)); 3426 sizeof(net->ipv4.dev_addr_genid));
2558 return 0; 3427 return 0;
@@ -2562,35 +3431,21 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
2562 .init = rt_genid_init, 3431 .init = rt_genid_init,
2563}; 3432};
2564 3433
2565static int __net_init ipv4_inetpeer_init(struct net *net)
2566{
2567 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2568
2569 if (!bp)
2570 return -ENOMEM;
2571 inet_peer_base_init(bp);
2572 net->ipv4.peers = bp;
2573 return 0;
2574}
2575
2576static void __net_exit ipv4_inetpeer_exit(struct net *net)
2577{
2578 struct inet_peer_base *bp = net->ipv4.peers;
2579
2580 net->ipv4.peers = NULL;
2581 inetpeer_invalidate_tree(bp);
2582 kfree(bp);
2583}
2584
2585static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2586 .init = ipv4_inetpeer_init,
2587 .exit = ipv4_inetpeer_exit,
2588};
2589 3434
2590#ifdef CONFIG_IP_ROUTE_CLASSID 3435#ifdef CONFIG_IP_ROUTE_CLASSID
2591struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3436struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2592#endif /* CONFIG_IP_ROUTE_CLASSID */ 3437#endif /* CONFIG_IP_ROUTE_CLASSID */
2593 3438
3439static __initdata unsigned long rhash_entries;
3440static int __init set_rhash_entries(char *str)
3441{
3442 if (!str)
3443 return 0;
3444 rhash_entries = simple_strtoul(str, &str, 0);
3445 return 1;
3446}
3447__setup("rhash_entries=", set_rhash_entries);
3448
2594int __init ip_rt_init(void) 3449int __init ip_rt_init(void)
2595{ 3450{
2596 int rc = 0; 3451 int rc = 0;
@@ -2613,17 +3468,35 @@ int __init ip_rt_init(void)
2613 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3468 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2614 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3469 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2615 3470
2616 ipv4_dst_ops.gc_thresh = ~0; 3471 rt_hash_table = (struct rt_hash_bucket *)
2617 ip_rt_max_size = INT_MAX; 3472 alloc_large_system_hash("IP route cache",
3473 sizeof(struct rt_hash_bucket),
3474 rhash_entries,
3475 (totalram_pages >= 128 * 1024) ?
3476 15 : 17,
3477 0,
3478 &rt_hash_log,
3479 &rt_hash_mask,
3480 rhash_entries ? 0 : 512 * 1024);
3481 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3482 rt_hash_lock_init();
3483
3484 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3485 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2618 3486
2619 devinet_init(); 3487 devinet_init();
2620 ip_fib_init(); 3488 ip_fib_init();
2621 3489
3490 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3491 expires_ljiffies = jiffies;
3492 schedule_delayed_work(&expires_work,
3493 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3494
2622 if (ip_rt_proc_init()) 3495 if (ip_rt_proc_init())
2623 pr_err("Unable to create route proc files\n"); 3496 printk(KERN_ERR "Unable to create route proc files\n");
2624#ifdef CONFIG_XFRM 3497#ifdef CONFIG_XFRM
2625 xfrm_init(); 3498 xfrm_init();
2626 xfrm4_init(); 3499 xfrm4_init(ip_rt_max_size);
2627#endif 3500#endif
2628 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); 3501 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2629 3502
@@ -2631,7 +3504,6 @@ int __init ip_rt_init(void)
2631 register_pernet_subsys(&sysctl_route_ops); 3504 register_pernet_subsys(&sysctl_route_ops);
2632#endif 3505#endif
2633 register_pernet_subsys(&rt_genid_ops); 3506 register_pernet_subsys(&rt_genid_ops);
2634 register_pernet_subsys(&ipv4_inetpeer_ops);
2635 return rc; 3507 return rc;
2636} 3508}
2637 3509
@@ -2642,6 +3514,6 @@ int __init ip_rt_init(void)
2642 */ 3514 */
2643void __init ip_static_sysctl_init(void) 3515void __init ip_static_sysctl_init(void)
2644{ 3516{
2645 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3517 register_sysctl_paths(ipv4_path, ipv4_skeleton);
2646} 3518}
2647#endif 3519#endif
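The rt_fill_info() hunks above also show two styles of netlink attribute emission: the left column checks each nla_put_*() call explicitly and jumps on failure, while the right column's NLA_PUT_*() macros hide that goto to the local nla_put_failure label. A toy model of the convention, with a stand-in put_u32() instead of the real nla_put_u32() and a deliberately tiny message buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_msg {
	unsigned char buf[16];
	size_t len;
};

/* Stand-in for nla_put_u32(): fails once the message is full. */
static int put_u32(struct toy_msg *m, uint32_t v)
{
	if (m->len + sizeof(v) > sizeof(m->buf))
		return -1;
	memcpy(m->buf + m->len, &v, sizeof(v));
	m->len += sizeof(v);
	return 0;
}

/* Mimics the old NLA_PUT_U32() convention: jump to a local label on error. */
#define TOY_PUT_U32(m, v)			\
	do {					\
		if (put_u32((m), (v)) < 0)	\
			goto nla_put_failure;	\
	} while (0)

static int fill_info(struct toy_msg *m)
{
	/* Explicit-check style. */
	if (put_u32(m, 1) < 0)
		goto nla_put_failure;
	/* Macro style: the jump is hidden inside TOY_PUT_U32(). */
	TOY_PUT_U32(m, 2);
	TOY_PUT_U32(m, 3);
	TOY_PUT_U32(m, 4);
	TOY_PUT_U32(m, 5);	/* fifth u32 overflows the 16-byte buffer */
	return 0;

nla_put_failure:
	return -1;
}

int main(void)
{
	struct toy_msg m = { .len = 0 };

	printf("fill_info: %d\n", fill_info(&m));	/* prints -1 */
	return 0;
}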
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b236ef04914..3bc5c8f7c71 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -15,7 +15,6 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/cryptohash.h> 16#include <linux/cryptohash.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/export.h>
19#include <net/tcp.h> 18#include <net/tcp.h>
20#include <net/route.h> 19#include <net/route.h>
21 20
@@ -245,7 +244,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
245 if (!sysctl_tcp_timestamps) 244 if (!sysctl_tcp_timestamps)
246 return false; 245 return false;
247 246
248 tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; 247 tcp_opt->sack_ok = (options >> 4) & 0x1;
249 *ecn_ok = (options >> 5) & 1; 248 *ecn_ok = (options >> 5) & 1;
250 if (*ecn_ok && !sysctl_tcp_ecn) 249 if (*ecn_ok && !sysctl_tcp_ecn)
251 return false; 250 return false;
@@ -266,7 +265,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
266 struct ip_options *opt) 265 struct ip_options *opt)
267{ 266{
268 struct tcp_options_received tcp_opt; 267 struct tcp_options_received tcp_opt;
269 const u8 *hash_location; 268 u8 *hash_location;
270 struct inet_request_sock *ireq; 269 struct inet_request_sock *ireq;
271 struct tcp_request_sock *treq; 270 struct tcp_request_sock *treq;
272 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
@@ -278,7 +277,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
278 struct rtable *rt; 277 struct rtable *rt;
279 __u8 rcv_wscale; 278 __u8 rcv_wscale;
280 bool ecn_ok = false; 279 bool ecn_ok = false;
281 struct flowi4 fl4;
282 280
283 if (!sysctl_tcp_syncookies || !th->ack || th->rst) 281 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
284 goto out; 282 goto out;
@@ -293,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
293 291
294 /* check for timestamp cookie support */ 292 /* check for timestamp cookie support */
295 memset(&tcp_opt, 0, sizeof(tcp_opt)); 293 memset(&tcp_opt, 0, sizeof(tcp_opt));
296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); 294 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
297 295
298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) 296 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
299 goto out; 297 goto out;
@@ -319,7 +317,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 317 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 318 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 319 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
323 320
324 /* We throwed the options of the initial SYN away, so we hope 321 /* We throwed the options of the initial SYN away, so we hope
325 * the ACK carries the same options again (see RFC1122 4.2.3.8) 322 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -340,7 +337,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
340 } 337 }
341 338
342 req->expires = 0UL; 339 req->expires = 0UL;
343 req->num_retrans = 0; 340 req->retrans = 0;
344 341
345 /* 342 /*
346 * We need to lookup the route here to get at the correct 343 * We need to lookup the route here to get at the correct
@@ -348,16 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
348 * hasn't changed since we received the original syn, but I see 345 * hasn't changed since we received the original syn, but I see
349 * no easy way to do this. 346 * no easy way to do this.
350 */ 347 */
351 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), 348 {
352 RT_SCOPE_UNIVERSE, IPPROTO_TCP, 349 struct flowi4 fl4;
353 inet_sk_flowi_flags(sk), 350
354 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, 351 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
355 ireq->loc_addr, th->source, th->dest); 352 RT_SCOPE_UNIVERSE, IPPROTO_TCP,
356 security_req_classify_flow(req, flowi4_to_flowi(&fl4)); 353 inet_sk_flowi_flags(sk),
357 rt = ip_route_output_key(sock_net(sk), &fl4); 354 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
358 if (IS_ERR(rt)) { 355 ireq->loc_addr, th->source, th->dest);
359 reqsk_free(req); 356 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
360 goto out; 357 rt = ip_route_output_key(sock_net(sk), &fl4);
358 if (IS_ERR(rt)) {
359 reqsk_free(req);
360 goto out;
361 }
361 } 362 }
362 363
363 /* Try to redo what tcp_v4_send_synack did. */ 364 /* Try to redo what tcp_v4_send_synack did. */
@@ -371,10 +372,5 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
371 ireq->rcv_wscale = rcv_wscale; 372 ireq->rcv_wscale = rcv_wscale;
372 373
373 ret = get_cookie_sock(sk, skb, req, &rt->dst); 374 ret = get_cookie_sock(sk, skb, req, &rt->dst);
374 /* ip_queue_xmit() depends on our flow being setup
375 * Normal sockets get it right from inet_csk_route_child_sock()
376 */
377 if (ret)
378 inet_sk(ret)->cork.fl.u.ip4 = fl4;
379out: return ret; 375out: return ret;
380} 376}
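cookie_check_timestamp() above recovers per-connection hints from a packed options word: as the hunk shows, bit 4 carries the SACK flag and bit 5 the ECN flag. A small illustration of that unpacking (not a reimplementation of the cookie format; the struct and sample value are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cookie_opts {
	bool sack_ok;	/* bit 4 */
	bool ecn_ok;	/* bit 5 */
};

static struct cookie_opts decode_cookie_options(uint32_t options)
{
	struct cookie_opts o = {
		.sack_ok = (options >> 4) & 1,
		.ecn_ok  = (options >> 5) & 1,
	};
	return o;
}

int main(void)
{
	struct cookie_opts o = decode_cookie_options(0x30);	/* bits 4 and 5 set */

	printf("sack=%d ecn=%d\n", o.sack_ok, o.ecn_ok);
	return 0;
}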
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d84400b6504..69fd7201129 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/nsproxy.h> 16#include <linux/nsproxy.h>
17#include <linux/swap.h>
18#include <net/snmp.h> 17#include <net/snmp.h>
19#include <net/icmp.h> 18#include <net/icmp.h>
20#include <net/ip.h> 19#include <net/ip.h>
@@ -24,10 +23,8 @@
24#include <net/cipso_ipv4.h> 23#include <net/cipso_ipv4.h>
25#include <net/inet_frag.h> 24#include <net/inet_frag.h>
26#include <net/ping.h> 25#include <net/ping.h>
27#include <net/tcp_memcontrol.h>
28 26
29static int zero; 27static int zero;
30static int two = 2;
31static int tcp_retr1_max = 255; 28static int tcp_retr1_max = 255;
32static int ip_local_port_range_min[] = { 1, 1 }; 29static int ip_local_port_range_min[] = { 1, 1 };
33static int ip_local_port_range_max[] = { 65535, 65535 }; 30static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -76,10 +73,10 @@ static int ipv4_local_port_range(ctl_table *table, int write,
76} 73}
77 74
78 75
79static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) 76void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
80{ 77{
81 kgid_t *data = table->data; 78 gid_t *data = table->data;
82 unsigned int seq; 79 unsigned seq;
83 do { 80 do {
84 seq = read_seqbegin(&sysctl_local_ports.lock); 81 seq = read_seqbegin(&sysctl_local_ports.lock);
85 82
@@ -89,12 +86,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
89} 86}
90 87
91/* Update system visible IP port range */ 88/* Update system visible IP port range */
92static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) 89static void set_ping_group_range(struct ctl_table *table, int range[2])
93{ 90{
94 kgid_t *data = table->data; 91 gid_t *data = table->data;
95 write_seqlock(&sysctl_local_ports.lock); 92 write_seqlock(&sysctl_local_ports.lock);
96 data[0] = low; 93 data[0] = range[0];
97 data[1] = high; 94 data[1] = range[1];
98 write_sequnlock(&sysctl_local_ports.lock); 95 write_sequnlock(&sysctl_local_ports.lock);
99} 96}
100 97
@@ -103,33 +100,21 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
103 void __user *buffer, 100 void __user *buffer,
104 size_t *lenp, loff_t *ppos) 101 size_t *lenp, loff_t *ppos)
105{ 102{
106 struct user_namespace *user_ns = current_user_ns();
107 int ret; 103 int ret;
108 gid_t urange[2]; 104 gid_t range[2];
109 kgid_t low, high;
110 ctl_table tmp = { 105 ctl_table tmp = {
111 .data = &urange, 106 .data = &range,
112 .maxlen = sizeof(urange), 107 .maxlen = sizeof(range),
113 .mode = table->mode, 108 .mode = table->mode,
114 .extra1 = &ip_ping_group_range_min, 109 .extra1 = &ip_ping_group_range_min,
115 .extra2 = &ip_ping_group_range_max, 110 .extra2 = &ip_ping_group_range_max,
116 }; 111 };
117 112
118 inet_get_ping_group_range_table(table, &low, &high); 113 inet_get_ping_group_range_table(table, range, range + 1);
119 urange[0] = from_kgid_munged(user_ns, low);
120 urange[1] = from_kgid_munged(user_ns, high);
121 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 114 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
122 115
123 if (write && ret == 0) { 116 if (write && ret == 0)
124 low = make_kgid(user_ns, urange[0]); 117 set_ping_group_range(table, range);
125 high = make_kgid(user_ns, urange[1]);
126 if (!gid_valid(low) || !gid_valid(high) ||
127 (urange[1] < urange[0]) || gid_lt(high, low)) {
128 low = make_kgid(&init_user_ns, 1);
129 high = make_kgid(&init_user_ns, 0);
130 }
131 set_ping_group_range(table, low, high);
132 }
133 118
134 return ret; 119 return ret;
135} 120}
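
The handler above never writes into the live table directly: it points a temporary ctl_table at a stack buffer, lets proc_dointvec_minmax() parse and range-check the input, and only then commits via set_ping_group_range(). A small userspace analogue of that parse, validate, then commit flow; parse_range() is a made-up helper, not a kernel API:

#include <stdio.h>
#include <errno.h>

/* parse two ids, validate against [min, max], commit only on success */
static int parse_range(const char *buf, int min, int max, int out[2])
{
        int lo, hi;

        if (sscanf(buf, "%d %d", &lo, &hi) != 2)
                return -EINVAL;
        if (lo < min || hi > max || hi < lo)
                return -EINVAL;
        out[0] = lo;
        out[1] = hi;
        return 0;
}

int main(void)
{
        int range[2] = { 1, 0 };        /* default: no group may create ping sockets */

        if (parse_range("100 200", 0, 65535, range) == 0)
                printf("ping_group_range set to %d %d\n", range[0], range[1]);
        return 0;
}
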
@@ -189,90 +174,6 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
189 return ret; 174 return ret;
190} 175}
191 176
192static int ipv4_tcp_mem(ctl_table *ctl, int write,
193 void __user *buffer, size_t *lenp,
194 loff_t *ppos)
195{
196 int ret;
197 unsigned long vec[3];
198 struct net *net = current->nsproxy->net_ns;
199#ifdef CONFIG_MEMCG_KMEM
200 struct mem_cgroup *memcg;
201#endif
202
203 ctl_table tmp = {
204 .data = &vec,
205 .maxlen = sizeof(vec),
206 .mode = ctl->mode,
207 };
208
209 if (!write) {
210 ctl->data = &net->ipv4.sysctl_tcp_mem;
211 return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
212 }
213
214 ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
215 if (ret)
216 return ret;
217
218#ifdef CONFIG_MEMCG_KMEM
219 rcu_read_lock();
220 memcg = mem_cgroup_from_task(current);
221
222 tcp_prot_mem(memcg, vec[0], 0);
223 tcp_prot_mem(memcg, vec[1], 1);
224 tcp_prot_mem(memcg, vec[2], 2);
225 rcu_read_unlock();
226#endif
227
228 net->ipv4.sysctl_tcp_mem[0] = vec[0];
229 net->ipv4.sysctl_tcp_mem[1] = vec[1];
230 net->ipv4.sysctl_tcp_mem[2] = vec[2];
231
232 return 0;
233}
234
235int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
236 size_t *lenp, loff_t *ppos)
237{
238 ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
239 struct tcp_fastopen_context *ctxt;
240 int ret;
241 u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
242
243 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
244 if (!tbl.data)
245 return -ENOMEM;
246
247 rcu_read_lock();
248 ctxt = rcu_dereference(tcp_fastopen_ctx);
249 if (ctxt)
250 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
251 else
252 memset(user_key, 0, sizeof(user_key));
253 rcu_read_unlock();
254
255 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
256 user_key[0], user_key[1], user_key[2], user_key[3]);
257 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
258
259 if (write && ret == 0) {
260 if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
261 user_key + 2, user_key + 3) != 4) {
262 ret = -EINVAL;
263 goto bad_key;
264 }
265 tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
266 }
267
268bad_key:
269 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
270 user_key[0], user_key[1], user_key[2], user_key[3],
271 (char *)tbl.data, ret);
272 kfree(tbl.data);
273 return ret;
274}
275
276static struct ctl_table ipv4_table[] = { 177static struct ctl_table ipv4_table[] = {
277 { 178 {
278 .procname = "tcp_timestamps", 179 .procname = "tcp_timestamps",
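
The removed proc_tcp_fastopen_key() handler renders the 16-byte key as four dash-separated 32-bit hex words and parses user input back the same way. A self-contained round-trip of that exact formatting:

#include <stdio.h>
#include <string.h>

#define TCP_FASTOPEN_KEY_LENGTH 16

int main(void)
{
        unsigned int key[4] = { 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 };
        unsigned int parsed[4];
        char buf[TCP_FASTOPEN_KEY_LENGTH * 2 + 10];

        snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
                 key[0], key[1], key[2], key[3]);

        if (sscanf(buf, "%x-%x-%x-%x",
                   &parsed[0], &parsed[1], &parsed[2], &parsed[3]) != 4)
                return 1;

        printf("%s round-trips: %s\n", buf,
               memcmp(key, parsed, sizeof(key)) ? "no" : "yes");
        return 0;
}
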
@@ -354,13 +255,6 @@ static struct ctl_table ipv4_table[] = {
354 .proc_handler = proc_dointvec 255 .proc_handler = proc_dointvec
355 }, 256 },
356 { 257 {
357 .procname = "ip_early_demux",
358 .data = &sysctl_ip_early_demux,
359 .maxlen = sizeof(int),
360 .mode = 0644,
361 .proc_handler = proc_dointvec
362 },
363 {
364 .procname = "ip_dynaddr", 258 .procname = "ip_dynaddr",
365 .data = &sysctl_ip_dynaddr, 259 .data = &sysctl_ip_dynaddr,
366 .maxlen = sizeof(int), 260 .maxlen = sizeof(int),
@@ -420,19 +314,6 @@ static struct ctl_table ipv4_table[] = {
420 }, 314 },
421#endif 315#endif
422 { 316 {
423 .procname = "tcp_fastopen",
424 .data = &sysctl_tcp_fastopen,
425 .maxlen = sizeof(int),
426 .mode = 0644,
427 .proc_handler = proc_dointvec,
428 },
429 {
430 .procname = "tcp_fastopen_key",
431 .mode = 0600,
432 .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
433 .proc_handler = proc_tcp_fastopen_key,
434 },
435 {
436 .procname = "tcp_tw_recycle", 317 .procname = "tcp_tw_recycle",
437 .data = &tcp_death_row.sysctl_tw_recycle, 318 .data = &tcp_death_row.sysctl_tw_recycle,
438 .maxlen = sizeof(int), 319 .maxlen = sizeof(int),
@@ -552,6 +433,13 @@ static struct ctl_table ipv4_table[] = {
552 .proc_handler = proc_dointvec 433 .proc_handler = proc_dointvec
553 }, 434 },
554 { 435 {
436 .procname = "tcp_mem",
437 .data = &sysctl_tcp_mem,
438 .maxlen = sizeof(sysctl_tcp_mem),
439 .mode = 0644,
440 .proc_handler = proc_doulongvec_minmax
441 },
442 {
555 .procname = "tcp_wmem", 443 .procname = "tcp_wmem",
556 .data = &sysctl_tcp_wmem, 444 .data = &sysctl_tcp_wmem,
557 .maxlen = sizeof(sysctl_tcp_wmem), 445 .maxlen = sizeof(sysctl_tcp_wmem),
@@ -664,20 +552,6 @@ static struct ctl_table ipv4_table[] = {
664 .mode = 0644, 552 .mode = 0644,
665 .proc_handler = proc_dointvec 553 .proc_handler = proc_dointvec
666 }, 554 },
667 {
668 .procname = "tcp_limit_output_bytes",
669 .data = &sysctl_tcp_limit_output_bytes,
670 .maxlen = sizeof(int),
671 .mode = 0644,
672 .proc_handler = proc_dointvec
673 },
674 {
675 .procname = "tcp_challenge_ack_limit",
676 .data = &sysctl_tcp_challenge_ack_limit,
677 .maxlen = sizeof(int),
678 .mode = 0644,
679 .proc_handler = proc_dointvec
680 },
681#ifdef CONFIG_NET_DMA 555#ifdef CONFIG_NET_DMA
682 { 556 {
683 .procname = "tcp_dma_copybreak", 557 .procname = "tcp_dma_copybreak",
@@ -765,15 +639,6 @@ static struct ctl_table ipv4_table[] = {
765 .proc_handler = proc_dointvec 639 .proc_handler = proc_dointvec
766 }, 640 },
767 { 641 {
768 .procname = "tcp_early_retrans",
769 .data = &sysctl_tcp_early_retrans,
770 .maxlen = sizeof(int),
771 .mode = 0644,
772 .proc_handler = proc_dointvec_minmax,
773 .extra1 = &zero,
774 .extra2 = &two,
775 },
776 {
777 .procname = "udp_mem", 642 .procname = "udp_mem",
778 .data = &sysctl_udp_mem, 643 .data = &sysctl_udp_mem,
779 .maxlen = sizeof(sysctl_udp_mem), 644 .maxlen = sizeof(sysctl_udp_mem),
@@ -843,21 +708,29 @@ static struct ctl_table ipv4_net_table[] = {
843 .proc_handler = proc_dointvec 708 .proc_handler = proc_dointvec
844 }, 709 },
845 { 710 {
846 .procname = "ping_group_range", 711 .procname = "rt_cache_rebuild_count",
847 .data = &init_net.ipv4.sysctl_ping_group_range, 712 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
848 .maxlen = sizeof(gid_t)*2, 713 .maxlen = sizeof(int),
849 .mode = 0644, 714 .mode = 0644,
850 .proc_handler = ipv4_ping_group_range, 715 .proc_handler = proc_dointvec
851 }, 716 },
852 { 717 {
853 .procname = "tcp_mem", 718 .procname = "ping_group_range",
854 .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), 719 .data = &init_net.ipv4.sysctl_ping_group_range,
720 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
855 .mode = 0644, 721 .mode = 0644,
856 .proc_handler = ipv4_tcp_mem, 722 .proc_handler = ipv4_ping_group_range,
857 }, 723 },
858 { } 724 { }
859}; 725};
860 726
727struct ctl_path net_ipv4_ctl_path[] = {
728 { .procname = "net", },
729 { .procname = "ipv4", },
730 { },
731};
732EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
733
861static __net_init int ipv4_sysctl_init_net(struct net *net) 734static __net_init int ipv4_sysctl_init_net(struct net *net)
862{ 735{
863 struct ctl_table *table; 736 struct ctl_table *table;
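
net_ipv4_ctl_path, added back above, is a sentinel-terminated array of path components that older kernels walked to place tables under net/ipv4. A toy userspace walk over such an array; struct ctl_path is redeclared locally only for the demo:

#include <stdio.h>
#include <string.h>

struct ctl_path {
        const char *procname;
};

static const struct ctl_path net_ipv4_ctl_path[] = {
        { .procname = "net",  },
        { .procname = "ipv4", },
        { .procname = NULL,   },        /* sentinel terminating the path */
};

int main(void)
{
        char joined[64] = "";
        const struct ctl_path *p;

        for (p = net_ipv4_ctl_path; p->procname; p++) {
                strcat(joined, p->procname);
                if ((p + 1)->procname)
                        strcat(joined, "/");
        }
        printf("tables would be registered under \"%s\"\n", joined);
        return 0;
}
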
@@ -881,23 +754,23 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
881 table[5].data = 754 table[5].data =
882 &net->ipv4.sysctl_icmp_ratemask; 755 &net->ipv4.sysctl_icmp_ratemask;
883 table[6].data = 756 table[6].data =
757 &net->ipv4.sysctl_rt_cache_rebuild_count;
758 table[7].data =
884 &net->ipv4.sysctl_ping_group_range; 759 &net->ipv4.sysctl_ping_group_range;
885 760
886 /* Don't export sysctls to unprivileged users */
887 if (net->user_ns != &init_user_ns)
888 table[0].procname = NULL;
889 } 761 }
890 762
891 /* 763 /*
892 * Sane defaults - nobody may create ping sockets. 764 * Sane defaults - nobody may create ping sockets.
893 * Boot scripts should set this to distro-specific group. 765 * Boot scripts should set this to distro-specific group.
894 */ 766 */
895 net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); 767 net->ipv4.sysctl_ping_group_range[0] = 1;
896 net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); 768 net->ipv4.sysctl_ping_group_range[1] = 0;
897 769
898 tcp_init_mem(net); 770 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
899 771
900 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); 772 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
773 net_ipv4_ctl_path, table);
901 if (net->ipv4.ipv4_hdr == NULL) 774 if (net->ipv4.ipv4_hdr == NULL)
902 goto err_reg; 775 goto err_reg;
903 776
@@ -938,12 +811,12 @@ static __init int sysctl_ipv4_init(void)
938 if (!i->procname) 811 if (!i->procname)
939 return -EINVAL; 812 return -EINVAL;
940 813
941 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); 814 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
942 if (hdr == NULL) 815 if (hdr == NULL)
943 return -ENOMEM; 816 return -ENOMEM;
944 817
945 if (register_pernet_subsys(&ipv4_sysctl_ops)) { 818 if (register_pernet_subsys(&ipv4_sysctl_ops)) {
946 unregister_net_sysctl_table(hdr); 819 unregister_sysctl_table(hdr);
947 return -ENOMEM; 820 return -ENOMEM;
948 } 821 }
949 822
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2aa69c8ae60..09ced58e6a5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -245,8 +245,6 @@
245 * TCP_CLOSE socket is finished 245 * TCP_CLOSE socket is finished
246 */ 246 */
247 247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h> 248#include <linux/kernel.h>
251#include <linux/module.h> 249#include <linux/module.h>
252#include <linux/types.h> 250#include <linux/types.h>
@@ -268,12 +266,15 @@
268#include <linux/crypto.h> 266#include <linux/crypto.h>
269#include <linux/time.h> 267#include <linux/time.h>
270#include <linux/slab.h> 268#include <linux/slab.h>
269#include <linux/uid_stat.h>
271 270
272#include <net/icmp.h> 271#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h> 272#include <net/tcp.h>
275#include <net/xfrm.h> 273#include <net/xfrm.h>
276#include <net/ip.h> 274#include <net/ip.h>
275#include <net/ip6_route.h>
276#include <net/ipv6.h>
277#include <net/transp_v6.h>
277#include <net/netdma.h> 278#include <net/netdma.h>
278#include <net/sock.h> 279#include <net/sock.h>
279 280
@@ -285,9 +286,11 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285struct percpu_counter tcp_orphan_count; 286struct percpu_counter tcp_orphan_count;
286EXPORT_SYMBOL_GPL(tcp_orphan_count); 287EXPORT_SYMBOL_GPL(tcp_orphan_count);
287 288
289long sysctl_tcp_mem[3] __read_mostly;
288int sysctl_tcp_wmem[3] __read_mostly; 290int sysctl_tcp_wmem[3] __read_mostly;
289int sysctl_tcp_rmem[3] __read_mostly; 291int sysctl_tcp_rmem[3] __read_mostly;
290 292
293EXPORT_SYMBOL(sysctl_tcp_mem);
291EXPORT_SYMBOL(sysctl_tcp_rmem); 294EXPORT_SYMBOL(sysctl_tcp_rmem);
292EXPORT_SYMBOL(sysctl_tcp_wmem); 295EXPORT_SYMBOL(sysctl_tcp_wmem);
293 296
@@ -364,72 +367,6 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
364 return period; 367 return period;
365} 368}
366 369
367/* Address-family independent initialization for a tcp_sock.
368 *
369 * NOTE: A lot of things set to zero explicitly by call to
370 * sk_alloc() so need not be done here.
371 */
372void tcp_init_sock(struct sock *sk)
373{
374 struct inet_connection_sock *icsk = inet_csk(sk);
375 struct tcp_sock *tp = tcp_sk(sk);
376
377 skb_queue_head_init(&tp->out_of_order_queue);
378 tcp_init_xmit_timers(sk);
379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
381
382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
383 tp->mdev = TCP_TIMEOUT_INIT;
384
385 /* So many TCP implementations out there (incorrectly) count the
386 * initial SYN frame in their delayed-ACK and congestion control
387 * algorithms that we must have the following bandaid to talk
388 * efficiently to them. -DaveM
389 */
390 tp->snd_cwnd = TCP_INIT_CWND;
391
392 /* See draft-stevens-tcpca-spec-01 for discussion of the
393 * initialization of these values.
394 */
395 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396 tp->snd_cwnd_clamp = ~0;
397 tp->mss_cache = TCP_MSS_DEFAULT;
398
399 tp->reordering = sysctl_tcp_reordering;
400 tcp_enable_early_retrans(tp);
401 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403 sk->sk_state = TCP_CLOSE;
404
405 sk->sk_write_space = sk_stream_write_space;
406 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
407
408 icsk->icsk_sync_mss = tcp_sync_mss;
409
410 /* TCP Cookie Transactions */
411 if (sysctl_tcp_cookie_size > 0) {
412 /* Default, cookies without s_data_payload. */
413 tp->cookie_values =
414 kzalloc(sizeof(*tp->cookie_values),
415 sk->sk_allocation);
416 if (tp->cookie_values != NULL)
417 kref_init(&tp->cookie_values->kref);
418 }
419 /* Presumed zeroed, in order of appearance:
420 * cookie_in_always, cookie_out_never,
421 * s_data_constant, s_data_in, s_data_out
422 */
423 sk->sk_sndbuf = sysctl_tcp_wmem[1];
424 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
425
426 local_bh_disable();
427 sock_update_memcg(sk);
428 sk_sockets_allocated_inc(sk);
429 local_bh_enable();
430}
431EXPORT_SYMBOL(tcp_init_sock);
432
433/* 370/*
434 * Wait for a TCP event. 371 * Wait for a TCP event.
435 * 372 *
@@ -441,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
441{ 378{
442 unsigned int mask; 379 unsigned int mask;
443 struct sock *sk = sock->sk; 380 struct sock *sk = sock->sk;
444 const struct tcp_sock *tp = tcp_sk(sk); 381 struct tcp_sock *tp = tcp_sk(sk);
445 382
446 sock_poll_wait(file, sk_sleep(sk), wait); 383 sock_poll_wait(file, sk_sleep(sk), wait);
447 if (sk->sk_state == TCP_LISTEN) 384 if (sk->sk_state == TCP_LISTEN)
@@ -486,9 +423,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 423 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 424 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 425
489 /* Connected or passive Fast Open socket? */ 426 /* Connected? */
490 if (sk->sk_state != TCP_SYN_SENT && 427 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
492 int target = sock_rcvlowat(sk, 0, INT_MAX); 428 int target = sock_rcvlowat(sk, 0, INT_MAX);
493 429
494 if (tp->urg_seq == tp->copied_seq && 430 if (tp->urg_seq == tp->copied_seq &&
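
Both versions of the test above rely on the same idiom: each TCP state is a small integer, so (1 << sk->sk_state) masked against a set of TCPF_* bits answers "is the socket in any of these states" with a single branch. An illustrative sketch with locally defined state values:

#include <stdio.h>

/* small distinct integers, mirroring the kernel's enum for the demo */
enum {
        TCP_ESTABLISHED = 1,
        TCP_SYN_SENT,
        TCP_SYN_RECV,
        TCP_FIN_WAIT1,
};

#define TCPF_SYN_SENT   (1 << TCP_SYN_SENT)
#define TCPF_SYN_RECV   (1 << TCP_SYN_RECV)

static int is_connected(int state)
{
        /* true for every state except SYN_SENT and SYN_RECV */
        return ((1 << state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) != 0;
}

int main(void)
{
        printf("ESTABLISHED connected? %d\n", is_connected(TCP_ESTABLISHED));
        printf("SYN_SENT    connected? %d\n", is_connected(TCP_SYN_SENT));
        return 0;
}
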
@@ -536,29 +472,30 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
536{ 472{
537 struct tcp_sock *tp = tcp_sk(sk); 473 struct tcp_sock *tp = tcp_sk(sk);
538 int answ; 474 int answ;
539 bool slow;
540 475
541 switch (cmd) { 476 switch (cmd) {
542 case SIOCINQ: 477 case SIOCINQ:
543 if (sk->sk_state == TCP_LISTEN) 478 if (sk->sk_state == TCP_LISTEN)
544 return -EINVAL; 479 return -EINVAL;
545 480
546 slow = lock_sock_fast(sk); 481 lock_sock(sk);
547 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 482 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
548 answ = 0; 483 answ = 0;
549 else if (sock_flag(sk, SOCK_URGINLINE) || 484 else if (sock_flag(sk, SOCK_URGINLINE) ||
550 !tp->urg_data || 485 !tp->urg_data ||
551 before(tp->urg_seq, tp->copied_seq) || 486 before(tp->urg_seq, tp->copied_seq) ||
552 !before(tp->urg_seq, tp->rcv_nxt)) { 487 !before(tp->urg_seq, tp->rcv_nxt)) {
488 struct sk_buff *skb;
553 489
554 answ = tp->rcv_nxt - tp->copied_seq; 490 answ = tp->rcv_nxt - tp->copied_seq;
555 491
556 /* Subtract 1, if FIN was received */ 492 /* Subtract 1, if FIN is in queue. */
557 if (answ && sock_flag(sk, SOCK_DONE)) 493 skb = skb_peek_tail(&sk->sk_receive_queue);
558 answ--; 494 if (answ && skb)
495 answ -= tcp_hdr(skb)->fin;
559 } else 496 } else
560 answ = tp->urg_seq - tp->copied_seq; 497 answ = tp->urg_seq - tp->copied_seq;
561 unlock_sock_fast(sk, slow); 498 release_sock(sk);
562 break; 499 break;
563 case SIOCATMARK: 500 case SIOCATMARK:
564 answ = tp->urg_data && tp->urg_seq == tp->copied_seq; 501 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
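
In both variants SIOCINQ reports rcv_nxt - copied_seq, minus one when a FIN is queued, because a FIN consumes a sequence number but carries no payload. A quick arithmetic check of that calculation:

#include <stdio.h>

int main(void)
{
        unsigned int copied_seq = 1000; /* last byte already handed to userspace */
        unsigned int rcv_nxt = 1501;    /* next sequence number expected from peer */
        int fin_queued = 1;
        unsigned int answ;

        answ = rcv_nxt - copied_seq;
        if (answ && fin_queued)
                answ--;                 /* the FIN occupies a sequence number, not data */

        printf("SIOCINQ would report %u bytes\n", answ);       /* 500 */
        return 0;
}
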
@@ -591,11 +528,11 @@ EXPORT_SYMBOL(tcp_ioctl);
591 528
592static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 529static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
593{ 530{
594 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 531 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
595 tp->pushed_seq = tp->write_seq; 532 tp->pushed_seq = tp->write_seq;
596} 533}
597 534
598static inline bool forced_push(const struct tcp_sock *tp) 535static inline int forced_push(struct tcp_sock *tp)
599{ 536{
600 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 537 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
601} 538}
@@ -607,7 +544,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
607 544
608 skb->csum = 0; 545 skb->csum = 0;
609 tcb->seq = tcb->end_seq = tp->write_seq; 546 tcb->seq = tcb->end_seq = tp->write_seq;
610 tcb->tcp_flags = TCPHDR_ACK; 547 tcb->flags = TCPHDR_ACK;
611 tcb->sacked = 0; 548 tcb->sacked = 0;
612 skb_header_release(skb); 549 skb_header_release(skb);
613 tcp_add_write_queue_tail(sk, skb); 550 tcp_add_write_queue_tail(sk, skb);
@@ -768,12 +705,11 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
768 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 705 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
769 if (skb) { 706 if (skb) {
770 if (sk_wmem_schedule(sk, skb->truesize)) { 707 if (sk_wmem_schedule(sk, skb->truesize)) {
771 skb_reserve(skb, sk->sk_prot->max_header);
772 /* 708 /*
773 * Make sure that we have exactly size bytes 709 * Make sure that we have exactly size bytes
774 * available to the caller, no more, no less. 710 * available to the caller, no more, no less.
775 */ 711 */
776 skb->avail_size = size; 712 skb_reserve(skb, skb_tailroom(skb) - size);
777 return skb; 713 return skb;
778 } 714 }
779 __kfree_skb(skb); 715 __kfree_skb(skb);
@@ -798,10 +734,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
798 inet_csk(sk)->icsk_ext_hdr_len - 734 inet_csk(sk)->icsk_ext_hdr_len -
799 tp->tcp_header_len); 735 tp->tcp_header_len);
800 736
801 /* TSQ : try to have two TSO segments in flight */
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); 737 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
806 738
807 /* We try hard to avoid divides here */ 739 /* We try hard to avoid divides here */
@@ -811,9 +743,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
811 old_size_goal + mss_now > xmit_size_goal)) { 743 old_size_goal + mss_now > xmit_size_goal)) {
812 xmit_size_goal = old_size_goal; 744 xmit_size_goal = old_size_goal;
813 } else { 745 } else {
814 tp->xmit_size_goal_segs = 746 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
815 min_t(u16, xmit_size_goal / mss_now,
816 sk->sk_gso_max_segs);
817 xmit_size_goal = tp->xmit_size_goal_segs * mss_now; 747 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
818 } 748 }
819 } 749 }
@@ -831,8 +761,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
831 return mss_now; 761 return mss_now;
832} 762}
833 763
834static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, 764static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
835 size_t size, int flags) 765 size_t psize, int flags)
836{ 766{
837 struct tcp_sock *tp = tcp_sk(sk); 767 struct tcp_sock *tp = tcp_sk(sk);
838 int mss_now, size_goal; 768 int mss_now, size_goal;
@@ -840,15 +770,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
840 ssize_t copied; 770 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 771 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 772
843 /* Wait for a connection to finish. One exception is TCP Fast Open 773 /* Wait for a connection to finish. */
844 * (passive side) where data is allowed to be sent before a connection 774 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
845 * is fully established.
846 */
847 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
848 !tcp_passive_fastopen(sk)) {
849 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 775 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
850 goto out_err; 776 goto out_err;
851 }
852 777
853 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 778 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 779
@@ -859,10 +784,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
859 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 784 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
860 goto out_err; 785 goto out_err;
861 786
862 while (size > 0) { 787 while (psize > 0) {
863 struct sk_buff *skb = tcp_write_queue_tail(sk); 788 struct sk_buff *skb = tcp_write_queue_tail(sk);
864 int copy, i; 789 struct page *page = pages[poffset / PAGE_SIZE];
865 bool can_coalesce; 790 int copy, i, can_coalesce;
791 int offset = poffset % PAGE_SIZE;
792 int size = min_t(size_t, psize, PAGE_SIZE - offset);
866 793
867 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { 794 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
868new_segment: 795new_segment:
@@ -890,7 +817,7 @@ new_segment:
890 goto wait_for_memory; 817 goto wait_for_memory;
891 818
892 if (can_coalesce) { 819 if (can_coalesce) {
893 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 820 skb_shinfo(skb)->frags[i - 1].size += copy;
894 } else { 821 } else {
895 get_page(page); 822 get_page(page);
896 skb_fill_page_desc(skb, i, page, offset, copy); 823 skb_fill_page_desc(skb, i, page, offset, copy);
@@ -907,11 +834,11 @@ new_segment:
907 skb_shinfo(skb)->gso_segs = 0; 834 skb_shinfo(skb)->gso_segs = 0;
908 835
909 if (!copied) 836 if (!copied)
910 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 837 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
911 838
912 copied += copy; 839 copied += copy;
913 offset += copy; 840 poffset += copy;
914 if (!(size -= copy)) 841 if (!(psize -= copy))
915 goto out; 842 goto out;
916 843
917 if (skb->len < size_goal || (flags & MSG_OOB)) 844 if (skb->len < size_goal || (flags & MSG_OOB))
@@ -927,7 +854,8 @@ new_segment:
927wait_for_sndbuf: 854wait_for_sndbuf:
928 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 855 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
929wait_for_memory: 856wait_for_memory:
930 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 857 if (copied)
858 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
931 859
932 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 860 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
933 goto do_error; 861 goto do_error;
@@ -936,7 +864,7 @@ wait_for_memory:
936 } 864 }
937 865
938out: 866out:
939 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 867 if (copied)
940 tcp_push(sk, flags, mss_now, tp->nonagle); 868 tcp_push(sk, flags, mss_now, tp->nonagle);
941 return copied; 869 return copied;
942 870
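
The older do_tcp_sendpages() signature restored above takes a page array plus a running byte offset (poffset) and recovers the current page and in-page offset with a divide and a modulo. Verifying that arithmetic in isolation, with PAGE_SIZE assumed to be 4096:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
        size_t poffset = 10000;                 /* byte offset into the page array */
        size_t page_idx = poffset / PAGE_SIZE;  /* which page holds the next byte */
        size_t offset = poffset % PAGE_SIZE;    /* where inside that page */

        printf("byte %zu -> page %zu, offset %zu\n", poffset, page_idx, offset);
        return 0;
}
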
@@ -958,24 +886,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
958 flags); 886 flags);
959 887
960 lock_sock(sk); 888 lock_sock(sk);
961 res = do_tcp_sendpages(sk, page, offset, size, flags); 889 res = do_tcp_sendpages(sk, &page, offset, size, flags);
962 release_sock(sk); 890 release_sock(sk);
963 return res; 891 return res;
964} 892}
965EXPORT_SYMBOL(tcp_sendpage); 893EXPORT_SYMBOL(tcp_sendpage);
966 894
967static inline int select_size(const struct sock *sk, bool sg) 895#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
896#define TCP_OFF(sk) (sk->sk_sndmsg_off)
897
898static inline int select_size(struct sock *sk, int sg)
968{ 899{
969 const struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
970 int tmp = tp->mss_cache; 901 int tmp = tp->mss_cache;
971 902
972 if (sg) { 903 if (sg) {
973 if (sk_can_gso(sk)) { 904 if (sk_can_gso(sk))
974 /* Small frames wont use a full page: 905 tmp = 0;
975 * Payload will immediately follow tcp header. 906 else {
976 */
977 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
978 } else {
979 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 907 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
980 908
981 if (tmp >= pgbreak && 909 if (tmp >= pgbreak &&
@@ -987,86 +915,27 @@ static inline int select_size(const struct sock *sk, bool sg)
987 return tmp; 915 return tmp;
988} 916}
989 917
990void tcp_free_fastopen_req(struct tcp_sock *tp)
991{
992 if (tp->fastopen_req != NULL) {
993 kfree(tp->fastopen_req);
994 tp->fastopen_req = NULL;
995 }
996}
997
998static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
999{
1000 struct tcp_sock *tp = tcp_sk(sk);
1001 int err, flags;
1002
1003 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1004 return -EOPNOTSUPP;
1005 if (tp->fastopen_req != NULL)
1006 return -EALREADY; /* Another Fast Open is in progress */
1007
1008 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1009 sk->sk_allocation);
1010 if (unlikely(tp->fastopen_req == NULL))
1011 return -ENOBUFS;
1012 tp->fastopen_req->data = msg;
1013
1014 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1015 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1016 msg->msg_namelen, flags);
1017 *size = tp->fastopen_req->copied;
1018 tcp_free_fastopen_req(tp);
1019 return err;
1020}
1021
1022int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 918int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1023 size_t size) 919 size_t size)
1024{ 920{
1025 struct iovec *iov; 921 struct iovec *iov;
1026 struct tcp_sock *tp = tcp_sk(sk); 922 struct tcp_sock *tp = tcp_sk(sk);
1027 struct sk_buff *skb; 923 struct sk_buff *skb;
1028 int iovlen, flags, err, copied = 0; 924 int iovlen, flags;
1029 int mss_now = 0, size_goal, copied_syn = 0, offset = 0; 925 int mss_now, size_goal;
1030 bool sg; 926 int sg, err, copied;
1031 long timeo; 927 long timeo;
1032 928
1033 lock_sock(sk); 929 lock_sock(sk);
1034 930
1035 flags = msg->msg_flags; 931 flags = msg->msg_flags;
1036 if (flags & MSG_FASTOPEN) {
1037 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1038 if (err == -EINPROGRESS && copied_syn > 0)
1039 goto out;
1040 else if (err)
1041 goto out_err;
1042 offset = copied_syn;
1043 }
1044
1045 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 932 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1046 933
1047 /* Wait for a connection to finish. One exception is TCP Fast Open 934 /* Wait for a connection to finish. */
1048 * (passive side) where data is allowed to be sent before a connection 935 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049 * is fully established.
1050 */
1051 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1052 !tcp_passive_fastopen(sk)) {
1053 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 936 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1054 goto do_error;
1055 }
1056
1057 if (unlikely(tp->repair)) {
1058 if (tp->repair_queue == TCP_RECV_QUEUE) {
1059 copied = tcp_send_rcvq(sk, msg, size);
1060 goto out;
1061 }
1062
1063 err = -EINVAL;
1064 if (tp->repair_queue == TCP_NO_QUEUE)
1065 goto out_err; 937 goto out_err;
1066 938
1067 /* 'common' sending to sendq */
1068 }
1069
1070 /* This should be in poll */ 939 /* This should be in poll */
1071 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 940 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1072 941
@@ -1081,22 +950,13 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1081 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 950 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1082 goto out_err; 951 goto out_err;
1083 952
1084 sg = !!(sk->sk_route_caps & NETIF_F_SG); 953 sg = sk->sk_route_caps & NETIF_F_SG;
1085 954
1086 while (--iovlen >= 0) { 955 while (--iovlen >= 0) {
1087 size_t seglen = iov->iov_len; 956 size_t seglen = iov->iov_len;
1088 unsigned char __user *from = iov->iov_base; 957 unsigned char __user *from = iov->iov_base;
1089 958
1090 iov++; 959 iov++;
1091 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1092 if (offset >= seglen) {
1093 offset -= seglen;
1094 continue;
1095 }
1096 seglen -= offset;
1097 from += offset;
1098 offset = 0;
1099 }
1100 960
1101 while (seglen > 0) { 961 while (seglen > 0) {
1102 int copy = 0; 962 int copy = 0;
@@ -1139,54 +999,86 @@ new_segment:
1139 copy = seglen; 999 copy = seglen;
1140 1000
1141 /* Where to copy to? */ 1001 /* Where to copy to? */
1142 if (skb_availroom(skb) > 0) { 1002 if (skb_tailroom(skb) > 0) {
1143 /* We have some space in skb head. Superb! */ 1003 /* We have some space in skb head. Superb! */
1144 copy = min_t(int, copy, skb_availroom(skb)); 1004 if (copy > skb_tailroom(skb))
1005 copy = skb_tailroom(skb);
1145 err = skb_add_data_nocache(sk, skb, from, copy); 1006 err = skb_add_data_nocache(sk, skb, from, copy);
1146 if (err) 1007 if (err)
1147 goto do_fault; 1008 goto do_fault;
1148 } else { 1009 } else {
1149 bool merge = true; 1010 int merge = 0;
1150 int i = skb_shinfo(skb)->nr_frags; 1011 int i = skb_shinfo(skb)->nr_frags;
1151 struct page_frag *pfrag = sk_page_frag(sk); 1012 struct page *page = TCP_PAGE(sk);
1152 1013 int off = TCP_OFF(sk);
1153 if (!sk_page_frag_refill(sk, pfrag)) 1014
1154 goto wait_for_memory; 1015 if (skb_can_coalesce(skb, i, page, off) &&
1155 1016 off != PAGE_SIZE) {
1156 if (!skb_can_coalesce(skb, i, pfrag->page, 1017 /* We can extend the last page
1157 pfrag->offset)) { 1018 * fragment. */
1158 if (i == MAX_SKB_FRAGS || !sg) { 1019 merge = 1;
1159 tcp_mark_push(tp, skb); 1020 } else if (i == MAX_SKB_FRAGS || !sg) {
1160 goto new_segment; 1021 /* Need to add new fragment and cannot
1022 * do this because interface is non-SG,
1023 * or because all the page slots are
1024 * busy. */
1025 tcp_mark_push(tp, skb);
1026 goto new_segment;
1027 } else if (page) {
1028 if (off == PAGE_SIZE) {
1029 put_page(page);
1030 TCP_PAGE(sk) = page = NULL;
1031 off = 0;
1161 } 1032 }
1162 merge = false; 1033 } else
1163 } 1034 off = 0;
1164 1035
1165 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1036 if (copy > PAGE_SIZE - off)
1037 copy = PAGE_SIZE - off;
1166 1038
1167 if (!sk_wmem_schedule(sk, copy)) 1039 if (!sk_wmem_schedule(sk, copy))
1168 goto wait_for_memory; 1040 goto wait_for_memory;
1169 1041
1042 if (!page) {
1043 /* Allocate new cache page. */
1044 if (!(page = sk_stream_alloc_page(sk)))
1045 goto wait_for_memory;
1046 }
1047
1048 /* Time to copy data. We are close to
1049 * the end! */
1170 err = skb_copy_to_page_nocache(sk, from, skb, 1050 err = skb_copy_to_page_nocache(sk, from, skb,
1171 pfrag->page, 1051 page, off, copy);
1172 pfrag->offset, 1052 if (err) {
1173 copy); 1053 /* If this page was new, give it to the
1174 if (err) 1054 * socket so it does not get leaked.
1055 */
1056 if (!TCP_PAGE(sk)) {
1057 TCP_PAGE(sk) = page;
1058 TCP_OFF(sk) = 0;
1059 }
1175 goto do_error; 1060 goto do_error;
1061 }
1176 1062
1177 /* Update the skb. */ 1063 /* Update the skb. */
1178 if (merge) { 1064 if (merge) {
1179 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1065 skb_shinfo(skb)->frags[i - 1].size +=
1066 copy;
1180 } else { 1067 } else {
1181 skb_fill_page_desc(skb, i, pfrag->page, 1068 skb_fill_page_desc(skb, i, page, off, copy);
1182 pfrag->offset, copy); 1069 if (TCP_PAGE(sk)) {
1183 get_page(pfrag->page); 1070 get_page(page);
1071 } else if (off + copy < PAGE_SIZE) {
1072 get_page(page);
1073 TCP_PAGE(sk) = page;
1074 }
1184 } 1075 }
1185 pfrag->offset += copy; 1076
1077 TCP_OFF(sk) = off + copy;
1186 } 1078 }
1187 1079
1188 if (!copied) 1080 if (!copied)
1189 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1081 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1190 1082
1191 tp->write_seq += copy; 1083 tp->write_seq += copy;
1192 TCP_SKB_CB(skb)->end_seq += copy; 1084 TCP_SKB_CB(skb)->end_seq += copy;
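
The restored sendmsg path caches one partially filled page per socket (the TCP_PAGE/TCP_OFF macros) so that successive small writes keep landing in the same page until it is full. A simplified userspace model of that coalescing; in the kernel the previous page is owned by the skb fragment rather than simply replaced as it is here:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

struct sock_cache {
        char *page;             /* cached, partially filled page */
        size_t off;             /* first free byte in that page */
};

/* copy len bytes, reusing the cached page while it still has room */
static void cached_copy(struct sock_cache *sk, const char *data, size_t len)
{
        while (len) {
                size_t copy;

                if (!sk->page || sk->off == PAGE_SIZE) {
                        /* the real code hands the full page to an skb frag;
                         * here we simply start a fresh one */
                        sk->page = malloc(PAGE_SIZE);
                        if (!sk->page)
                                return;
                        sk->off = 0;
                }
                copy = len < PAGE_SIZE - sk->off ? len : PAGE_SIZE - sk->off;
                memcpy(sk->page + sk->off, data, copy);
                sk->off += copy;
                data += copy;
                len -= copy;
        }
}

int main(void)
{
        struct sock_cache sk = { NULL, 0 };

        cached_copy(&sk, "hello", 5);
        cached_copy(&sk, "world", 5);
        printf("cached page now holds %zu bytes\n", sk.off);   /* 10 */
        return 0;
}
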
@@ -1197,7 +1089,7 @@ new_segment:
1197 if ((seglen -= copy) == 0 && iovlen == 0) 1089 if ((seglen -= copy) == 0 && iovlen == 0)
1198 goto out; 1090 goto out;
1199 1091
1200 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) 1092 if (skb->len < max || (flags & MSG_OOB))
1201 continue; 1093 continue;
1202 1094
1203 if (forced_push(tp)) { 1095 if (forced_push(tp)) {
@@ -1224,7 +1116,10 @@ out:
1224 if (copied) 1116 if (copied)
1225 tcp_push(sk, flags, mss_now, tp->nonagle); 1117 tcp_push(sk, flags, mss_now, tp->nonagle);
1226 release_sock(sk); 1118 release_sock(sk);
1227 return copied + copied_syn; 1119
1120 if (copied > 0)
1121 uid_stat_tcp_snd(current_uid(), copied);
1122 return copied;
1228 1123
1229do_fault: 1124do_fault:
1230 if (!skb->len) { 1125 if (!skb->len) {
@@ -1237,7 +1132,7 @@ do_fault:
1237 } 1132 }
1238 1133
1239do_error: 1134do_error:
1240 if (copied + copied_syn) 1135 if (copied)
1241 goto out; 1136 goto out;
1242out_err: 1137out_err:
1243 err = sk_stream_error(sk, flags, err); 1138 err = sk_stream_error(sk, flags, err);
@@ -1295,24 +1190,6 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1295 return -EAGAIN; 1190 return -EAGAIN;
1296} 1191}
1297 1192
1298static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1299{
1300 struct sk_buff *skb;
1301 int copied = 0, err = 0;
1302
1303 /* XXX -- need to support SO_PEEK_OFF */
1304
1305 skb_queue_walk(&sk->sk_write_queue, skb) {
1306 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1307 if (err)
1308 break;
1309
1310 copied += skb->len;
1311 }
1312
1313 return err ?: copied;
1314}
1315
1316/* Clean up the receive buffer for full frames taken by the user, 1193/* Clean up the receive buffer for full frames taken by the user,
1317 * then send an ACK if necessary. COPIED is the number of bytes 1194 * then send an ACK if necessary. COPIED is the number of bytes
1318 * tcp_recvmsg has given to the user so far, it speeds up the 1195 * tcp_recvmsg has given to the user so far, it speeds up the
@@ -1322,13 +1199,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1322void tcp_cleanup_rbuf(struct sock *sk, int copied) 1199void tcp_cleanup_rbuf(struct sock *sk, int copied)
1323{ 1200{
1324 struct tcp_sock *tp = tcp_sk(sk); 1201 struct tcp_sock *tp = tcp_sk(sk);
1325 bool time_to_ack = false; 1202 int time_to_ack = 0;
1326 1203
1204#if TCP_DEBUG
1327 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1205 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1328 1206
1329 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1207 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1330 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1208 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1331 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1209 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1210#endif
1332 1211
1333 if (inet_csk_ack_scheduled(sk)) { 1212 if (inet_csk_ack_scheduled(sk)) {
1334 const struct inet_connection_sock *icsk = inet_csk(sk); 1213 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1348,7 +1227,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1348 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 1227 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1349 !icsk->icsk_ack.pingpong)) && 1228 !icsk->icsk_ack.pingpong)) &&
1350 !atomic_read(&sk->sk_rmem_alloc))) 1229 !atomic_read(&sk->sk_rmem_alloc)))
1351 time_to_ack = true; 1230 time_to_ack = 1;
1352 } 1231 }
1353 1232
1354 /* We send an ACK if we can now advertise a non-zero window 1233 /* We send an ACK if we can now advertise a non-zero window
@@ -1370,7 +1249,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1370 * "Lots" means "at least twice" here. 1249 * "Lots" means "at least twice" here.
1371 */ 1250 */
1372 if (new_window && new_window >= 2 * rcv_window_now) 1251 if (new_window && new_window >= 2 * rcv_window_now)
1373 time_to_ack = true; 1252 time_to_ack = 1;
1374 } 1253 }
1375 } 1254 }
1376 if (time_to_ack) 1255 if (time_to_ack)
@@ -1428,12 +1307,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
1428} 1307}
1429#endif 1308#endif
1430 1309
1431static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1310static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1432{ 1311{
1433 struct sk_buff *skb; 1312 struct sk_buff *skb;
1434 u32 offset; 1313 u32 offset;
1435 1314
1436 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1315 skb_queue_walk(&sk->sk_receive_queue, skb) {
1437 offset = seq - TCP_SKB_CB(skb)->seq; 1316 offset = seq - TCP_SKB_CB(skb)->seq;
1438 if (tcp_hdr(skb)->syn) 1317 if (tcp_hdr(skb)->syn)
1439 offset--; 1318 offset--;
@@ -1441,11 +1320,6 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1441 *off = offset; 1320 *off = offset;
1442 return skb; 1321 return skb;
1443 } 1322 }
1444 /* This looks weird, but this can happen if TCP collapsing
1445 * splitted a fat GRO packet, while we released socket lock
1446 * in skb_splice_bits()
1447 */
1448 sk_eat_skb(sk, skb, false);
1449 } 1323 }
1450 return NULL; 1324 return NULL;
1451} 1325}
@@ -1487,7 +1361,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1487 break; 1361 break;
1488 } 1362 }
1489 used = recv_actor(desc, skb, offset, len); 1363 used = recv_actor(desc, skb, offset, len);
1490 if (used <= 0) { 1364 if (used < 0) {
1491 if (!copied) 1365 if (!copied)
1492 copied = used; 1366 copied = used;
1493 break; 1367 break;
@@ -1496,26 +1370,22 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1496 copied += used; 1370 copied += used;
1497 offset += used; 1371 offset += used;
1498 } 1372 }
1499 /* If recv_actor drops the lock (e.g. TCP splice 1373 /*
1374 * If recv_actor drops the lock (e.g. TCP splice
1500 * receive) the skb pointer might be invalid when 1375 * receive) the skb pointer might be invalid when
1501 * getting here: tcp_collapse might have deleted it 1376 * getting here: tcp_collapse might have deleted it
1502 * while aggregating skbs from the socket queue. 1377 * while aggregating skbs from the socket queue.
1503 */ 1378 */
1504 skb = tcp_recv_skb(sk, seq - 1, &offset); 1379 skb = tcp_recv_skb(sk, seq-1, &offset);
1505 if (!skb) 1380 if (!skb || (offset+1 != skb->len))
1506 break; 1381 break;
1507 /* TCP coalescing might have appended data to the skb.
1508 * Try to splice more frags
1509 */
1510 if (offset + 1 != skb->len)
1511 continue;
1512 } 1382 }
1513 if (tcp_hdr(skb)->fin) { 1383 if (tcp_hdr(skb)->fin) {
1514 sk_eat_skb(sk, skb, false); 1384 sk_eat_skb(sk, skb, 0);
1515 ++seq; 1385 ++seq;
1516 break; 1386 break;
1517 } 1387 }
1518 sk_eat_skb(sk, skb, false); 1388 sk_eat_skb(sk, skb, 0);
1519 if (!desc->count) 1389 if (!desc->count)
1520 break; 1390 break;
1521 tp->copied_seq = seq; 1391 tp->copied_seq = seq;
@@ -1526,9 +1396,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1526 1396
1527 /* Clean up data we have read: This will do ACK frames. */ 1397 /* Clean up data we have read: This will do ACK frames. */
1528 if (copied > 0) { 1398 if (copied > 0) {
1529 tcp_recv_skb(sk, seq, &offset);
1530 tcp_cleanup_rbuf(sk, copied); 1399 tcp_cleanup_rbuf(sk, copied);
1400 uid_stat_tcp_rcv(current_uid(), copied);
1531 } 1401 }
1402
1532 return copied; 1403 return copied;
1533} 1404}
1534EXPORT_SYMBOL(tcp_read_sock); 1405EXPORT_SYMBOL(tcp_read_sock);
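
tcp_recv_skb() above goes back to skb_queue_walk(), i.e. it iterates the receive queue in place to find the segment containing a given sequence number instead of peeking and eating from the head. A stand-alone sketch of that walk over a tiny singly linked queue; struct seg is a simplified stand-in for sk_buff, not the kernel type:

#include <stdio.h>
#include <stddef.h>

struct seg {
        unsigned int seq;       /* sequence number of the segment's first byte */
        unsigned int len;
        struct seg *next;
};

/* walk the queue (like skb_queue_walk) to find the segment holding seq */
static struct seg *find_seg(struct seg *head, unsigned int seq, unsigned int *off)
{
        struct seg *s;

        for (s = head; s; s = s->next) {
                unsigned int offset = seq - s->seq;     /* wraps harmlessly */

                if (offset < s->len) {
                        *off = offset;
                        return s;
                }
        }
        return NULL;
}

int main(void)
{
        struct seg c = { .seq = 300, .len = 100, .next = NULL };
        struct seg b = { .seq = 200, .len = 100, .next = &c };
        struct seg a = { .seq = 100, .len = 100, .next = &b };
        unsigned int off;
        struct seg *s = find_seg(&a, 250, &off);

        if (s)
                printf("seq 250 lives at offset %u of the segment starting at %u\n",
                       off, s->seq);
        return 0;
}
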
@@ -1553,7 +1424,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1553 int target; /* Read at least this many bytes */ 1424 int target; /* Read at least this many bytes */
1554 long timeo; 1425 long timeo;
1555 struct task_struct *user_recv = NULL; 1426 struct task_struct *user_recv = NULL;
1556 bool copied_early = false; 1427 int copied_early = 0;
1557 struct sk_buff *skb; 1428 struct sk_buff *skb;
1558 u32 urg_hole = 0; 1429 u32 urg_hole = 0;
1559 1430
@@ -1569,21 +1440,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1569 if (flags & MSG_OOB) 1440 if (flags & MSG_OOB)
1570 goto recv_urg; 1441 goto recv_urg;
1571 1442
1572 if (unlikely(tp->repair)) {
1573 err = -EPERM;
1574 if (!(flags & MSG_PEEK))
1575 goto out;
1576
1577 if (tp->repair_queue == TCP_SEND_QUEUE)
1578 goto recv_sndq;
1579
1580 err = -EINVAL;
1581 if (tp->repair_queue == TCP_NO_QUEUE)
1582 goto out;
1583
1584 /* 'common' recv queue MSG_PEEK-ing */
1585 }
1586
1587 seq = &tp->copied_seq; 1443 seq = &tp->copied_seq;
1588 if (flags & MSG_PEEK) { 1444 if (flags & MSG_PEEK) {
1589 peek_seq = tp->copied_seq; 1445 peek_seq = tp->copied_seq;
@@ -1604,7 +1460,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1604 if ((available < target) && 1460 if ((available < target) &&
1605 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1461 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1606 !sysctl_tcp_low_latency && 1462 !sysctl_tcp_low_latency &&
1607 net_dma_find_channel()) { 1463 dma_find_channel(DMA_MEMCPY)) {
1608 preempt_enable_no_resched(); 1464 preempt_enable_no_resched();
1609 tp->ucopy.pinned_list = 1465 tp->ucopy.pinned_list =
1610 dma_pin_iovec_pages(msg->msg_iov, len); 1466 dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1745,14 +1601,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1745 } 1601 }
1746 1602
1747#ifdef CONFIG_NET_DMA 1603#ifdef CONFIG_NET_DMA
1748 if (tp->ucopy.dma_chan) { 1604 if (tp->ucopy.dma_chan)
1749 if (tp->rcv_wnd == 0 && 1605 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1750 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1751 tcp_service_net_dma(sk, true);
1752 tcp_cleanup_rbuf(sk, copied);
1753 } else
1754 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1755 }
1756#endif 1606#endif
1757 if (copied >= target) { 1607 if (copied >= target) {
1758 /* Do not sleep, just process backlog. */ 1608 /* Do not sleep, just process backlog. */
@@ -1791,9 +1641,9 @@ do_prequeue:
1791 } 1641 }
1792 if ((flags & MSG_PEEK) && 1642 if ((flags & MSG_PEEK) &&
1793 (peek_seq - copied - urg_hole != tp->copied_seq)) { 1643 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1794 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", 1644 if (net_ratelimit())
1795 current->comm, 1645 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1796 task_pid_nr(current)); 1646 current->comm, task_pid_nr(current));
1797 peek_seq = tp->copied_seq; 1647 peek_seq = tp->copied_seq;
1798 } 1648 }
1799 continue; 1649 continue;
@@ -1825,7 +1675,7 @@ do_prequeue:
1825 if (!(flags & MSG_TRUNC)) { 1675 if (!(flags & MSG_TRUNC)) {
1826#ifdef CONFIG_NET_DMA 1676#ifdef CONFIG_NET_DMA
1827 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1677 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1828 tp->ucopy.dma_chan = net_dma_find_channel(); 1678 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1829 1679
1830 if (tp->ucopy.dma_chan) { 1680 if (tp->ucopy.dma_chan) {
1831 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( 1681 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1835,8 +1685,7 @@ do_prequeue:
1835 1685
1836 if (tp->ucopy.dma_cookie < 0) { 1686 if (tp->ucopy.dma_cookie < 0) {
1837 1687
1838 pr_alert("%s: dma_cookie < 0\n", 1688 printk(KERN_ALERT "dma_cookie < 0\n");
1839 __func__);
1840 1689
1841 /* Exception. Bailout! */ 1690 /* Exception. Bailout! */
1842 if (!copied) 1691 if (!copied)
@@ -1847,7 +1696,7 @@ do_prequeue:
1847 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1696 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1848 1697
1849 if ((offset + used) == skb->len) 1698 if ((offset + used) == skb->len)
1850 copied_early = true; 1699 copied_early = 1;
1851 1700
1852 } else 1701 } else
1853#endif 1702#endif
@@ -1881,7 +1730,7 @@ skip_copy:
1881 goto found_fin_ok; 1730 goto found_fin_ok;
1882 if (!(flags & MSG_PEEK)) { 1731 if (!(flags & MSG_PEEK)) {
1883 sk_eat_skb(sk, skb, copied_early); 1732 sk_eat_skb(sk, skb, copied_early);
1884 copied_early = false; 1733 copied_early = 0;
1885 } 1734 }
1886 continue; 1735 continue;
1887 1736
@@ -1890,7 +1739,7 @@ skip_copy:
1890 ++*seq; 1739 ++*seq;
1891 if (!(flags & MSG_PEEK)) { 1740 if (!(flags & MSG_PEEK)) {
1892 sk_eat_skb(sk, skb, copied_early); 1741 sk_eat_skb(sk, skb, copied_early);
1893 copied_early = false; 1742 copied_early = 0;
1894 } 1743 }
1895 break; 1744 break;
1896 } while (len > 0); 1745 } while (len > 0);
@@ -1932,6 +1781,9 @@ skip_copy:
1932 tcp_cleanup_rbuf(sk, copied); 1781 tcp_cleanup_rbuf(sk, copied);
1933 1782
1934 release_sock(sk); 1783 release_sock(sk);
1784
1785 if (copied > 0)
1786 uid_stat_tcp_rcv(current_uid(), copied);
1935 return copied; 1787 return copied;
1936 1788
1937out: 1789out:
@@ -1940,10 +1792,8 @@ out:
1940 1792
1941recv_urg: 1793recv_urg:
1942 err = tcp_recv_urg(sk, msg, len, flags); 1794 err = tcp_recv_urg(sk, msg, len, flags);
1943 goto out; 1795 if (err > 0)
1944 1796 uid_stat_tcp_rcv(current_uid(), err);
1945recv_sndq:
1946 err = tcp_peek_sndq(sk, msg, len);
1947 goto out; 1797 goto out;
1948} 1798}
1949EXPORT_SYMBOL(tcp_recvmsg); 1799EXPORT_SYMBOL(tcp_recvmsg);
@@ -2041,20 +1891,6 @@ void tcp_shutdown(struct sock *sk, int how)
2041} 1891}
2042EXPORT_SYMBOL(tcp_shutdown); 1892EXPORT_SYMBOL(tcp_shutdown);
2043 1893
2044bool tcp_check_oom(struct sock *sk, int shift)
2045{
2046 bool too_many_orphans, out_of_socket_memory;
2047
2048 too_many_orphans = tcp_too_many_orphans(sk, shift);
2049 out_of_socket_memory = tcp_out_of_memory(sk);
2050
2051 if (too_many_orphans)
2052 net_info_ratelimited("too many orphaned sockets\n");
2053 if (out_of_socket_memory)
2054 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2055 return too_many_orphans || out_of_socket_memory;
2056}
2057
2058void tcp_close(struct sock *sk, long timeout) 1894void tcp_close(struct sock *sk, long timeout)
2059{ 1895{
2060 struct sk_buff *skb; 1896 struct sk_buff *skb;
@@ -2097,9 +1933,7 @@ void tcp_close(struct sock *sk, long timeout)
2097 * advertise a zero window, then kill -9 the FTP client, wheee... 1933 * advertise a zero window, then kill -9 the FTP client, wheee...
2098 * Note: timeout is always zero in such a case. 1934 * Note: timeout is always zero in such a case.
2099 */ 1935 */
2100 if (unlikely(tcp_sk(sk)->repair)) { 1936 if (data_was_unread) {
2101 sk->sk_prot->disconnect(sk, 0);
2102 } else if (data_was_unread) {
2103 /* Unread data was tossed, zap the connection. */ 1937 /* Unread data was tossed, zap the connection. */
2104 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 1938 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2105 tcp_set_state(sk, TCP_CLOSE); 1939 tcp_set_state(sk, TCP_CLOSE);
@@ -2133,10 +1967,6 @@ void tcp_close(struct sock *sk, long timeout)
2133 * they look as CLOSING or LAST_ACK for Linux) 1967 * they look as CLOSING or LAST_ACK for Linux)
2134 * Probably, I missed some more holelets. 1968 * Probably, I missed some more holelets.
2135 * --ANK 1969 * --ANK
2136 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2137 * in a single packet! (May consider it later but will
2138 * probably need API support or TCP_CORK SYN-ACK until
2139 * data is written and socket is closed.)
2140 */ 1970 */
2141 tcp_send_fin(sk); 1971 tcp_send_fin(sk);
2142 } 1972 }
@@ -2200,7 +2030,10 @@ adjudge_to_death:
2200 } 2030 }
2201 if (sk->sk_state != TCP_CLOSE) { 2031 if (sk->sk_state != TCP_CLOSE) {
2202 sk_mem_reclaim(sk); 2032 sk_mem_reclaim(sk);
2203 if (tcp_check_oom(sk, 0)) { 2033 if (tcp_too_many_orphans(sk, 0)) {
2034 if (net_ratelimit())
2035 printk(KERN_INFO "TCP: too many of orphaned "
2036 "sockets\n");
2204 tcp_set_state(sk, TCP_CLOSE); 2037 tcp_set_state(sk, TCP_CLOSE);
2205 tcp_send_active_reset(sk, GFP_ATOMIC); 2038 tcp_send_active_reset(sk, GFP_ATOMIC);
2206 NET_INC_STATS_BH(sock_net(sk), 2039 NET_INC_STATS_BH(sock_net(sk),
@@ -2208,16 +2041,8 @@ adjudge_to_death:
2208 } 2041 }
2209 } 2042 }
2210 2043
2211 if (sk->sk_state == TCP_CLOSE) { 2044 if (sk->sk_state == TCP_CLOSE)
2212 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2213 /* We could get here with a non-NULL req if the socket is
2214 * aborted (e.g., closed with unread data) before 3WHS
2215 * finishes.
2216 */
2217 if (req != NULL)
2218 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk); 2045 inet_csk_destroy_sock(sk);
2220 }
2221 /* Otherwise, socket is reprieved until protocol close. */ 2046 /* Otherwise, socket is reprieved until protocol close. */
2222 2047
2223out: 2048out:
@@ -2229,7 +2054,7 @@ EXPORT_SYMBOL(tcp_close);
2229 2054
2230/* These states need RST on ABORT according to RFC793 */ 2055/* These states need RST on ABORT according to RFC793 */
2231 2056
2232static inline bool tcp_need_reset(int state) 2057static inline int tcp_need_reset(int state)
2233{ 2058{
2234 return (1 << state) & 2059 return (1 << state) &
2235 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 2060 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2250,8 +2075,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2250 /* ABORT function of RFC793 */ 2075 /* ABORT function of RFC793 */
2251 if (old_state == TCP_LISTEN) { 2076 if (old_state == TCP_LISTEN) {
2252 inet_csk_listen_stop(sk); 2077 inet_csk_listen_stop(sk);
2253 } else if (unlikely(tp->repair)) {
2254 sk->sk_err = ECONNABORTED;
2255 } else if (tcp_need_reset(old_state) || 2078 } else if (tcp_need_reset(old_state) ||
2256 (tp->snd_nxt != tp->write_seq && 2079 (tp->snd_nxt != tp->write_seq &&
2257 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 2080 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2303,68 +2126,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2303} 2126}
2304EXPORT_SYMBOL(tcp_disconnect); 2127EXPORT_SYMBOL(tcp_disconnect);
2305 2128
2306void tcp_sock_destruct(struct sock *sk)
2307{
2308 inet_sock_destruct(sk);
2309
2310 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2311}
2312
2313static inline bool tcp_can_repair_sock(const struct sock *sk)
2314{
2315 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2316 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2317}
2318
2319static int tcp_repair_options_est(struct tcp_sock *tp,
2320 struct tcp_repair_opt __user *optbuf, unsigned int len)
2321{
2322 struct tcp_repair_opt opt;
2323
2324 while (len >= sizeof(opt)) {
2325 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2326 return -EFAULT;
2327
2328 optbuf++;
2329 len -= sizeof(opt);
2330
2331 switch (opt.opt_code) {
2332 case TCPOPT_MSS:
2333 tp->rx_opt.mss_clamp = opt.opt_val;
2334 break;
2335 case TCPOPT_WINDOW:
2336 {
2337 u16 snd_wscale = opt.opt_val & 0xFFFF;
2338 u16 rcv_wscale = opt.opt_val >> 16;
2339
2340 if (snd_wscale > 14 || rcv_wscale > 14)
2341 return -EFBIG;
2342
2343 tp->rx_opt.snd_wscale = snd_wscale;
2344 tp->rx_opt.rcv_wscale = rcv_wscale;
2345 tp->rx_opt.wscale_ok = 1;
2346 }
2347 break;
2348 case TCPOPT_SACK_PERM:
2349 if (opt.opt_val != 0)
2350 return -EINVAL;
2351
2352 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2353 if (sysctl_tcp_fack)
2354 tcp_enable_fack(tp);
2355 break;
2356 case TCPOPT_TIMESTAMP:
2357 if (opt.opt_val != 0)
2358 return -EINVAL;
2359
2360 tp->rx_opt.tstamp_ok = 1;
2361 break;
2362 }
2363 }
2364
2365 return 0;
2366}
2367
2368/* 2129/*
2369 * Socket option code for TCP. 2130 * Socket option code for TCP.
2370 */ 2131 */
@@ -2535,55 +2296,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2535 err = -EINVAL; 2296 err = -EINVAL;
2536 else 2297 else
2537 tp->thin_dupack = val; 2298 tp->thin_dupack = val;
2538 if (tp->thin_dupack)
2539 tcp_disable_early_retrans(tp);
2540 break;
2541
2542 case TCP_REPAIR:
2543 if (!tcp_can_repair_sock(sk))
2544 err = -EPERM;
2545 else if (val == 1) {
2546 tp->repair = 1;
2547 sk->sk_reuse = SK_FORCE_REUSE;
2548 tp->repair_queue = TCP_NO_QUEUE;
2549 } else if (val == 0) {
2550 tp->repair = 0;
2551 sk->sk_reuse = SK_NO_REUSE;
2552 tcp_send_window_probe(sk);
2553 } else
2554 err = -EINVAL;
2555
2556 break;
2557
2558 case TCP_REPAIR_QUEUE:
2559 if (!tp->repair)
2560 err = -EPERM;
2561 else if (val < TCP_QUEUES_NR)
2562 tp->repair_queue = val;
2563 else
2564 err = -EINVAL;
2565 break;
2566
2567 case TCP_QUEUE_SEQ:
2568 if (sk->sk_state != TCP_CLOSE)
2569 err = -EPERM;
2570 else if (tp->repair_queue == TCP_SEND_QUEUE)
2571 tp->write_seq = val;
2572 else if (tp->repair_queue == TCP_RECV_QUEUE)
2573 tp->rcv_nxt = val;
2574 else
2575 err = -EINVAL;
2576 break;
2577
2578 case TCP_REPAIR_OPTIONS:
2579 if (!tp->repair)
2580 err = -EINVAL;
2581 else if (sk->sk_state == TCP_ESTABLISHED)
2582 err = tcp_repair_options_est(tp,
2583 (struct tcp_repair_opt __user *)optval,
2584 optlen);
2585 else
2586 err = -EPERM;
2587 break; 2299 break;
2588 2300
2589 case TCP_CORK: 2301 case TCP_CORK:
@@ -2698,18 +2410,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2698 /* Cap the max timeout in ms TCP will retry/retrans 2410 /* Cap the max timeout in ms TCP will retry/retrans
2699 * before giving up and aborting (ETIMEDOUT) a connection. 2411 * before giving up and aborting (ETIMEDOUT) a connection.
2700 */ 2412 */
2701 if (val < 0) 2413 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2702 err = -EINVAL;
2703 else
2704 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2705 break;
2706
2707 case TCP_FASTOPEN:
2708 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2709 TCPF_LISTEN)))
2710 err = fastopen_init_queue(sk, val);
2711 else
2712 err = -EINVAL;
2713 break; 2414 break;
2714 default: 2415 default:
2715 err = -ENOPROTOOPT; 2416 err = -ENOPROTOOPT;
@@ -2723,7 +2424,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2723int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 2424int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2724 unsigned int optlen) 2425 unsigned int optlen)
2725{ 2426{
2726 const struct inet_connection_sock *icsk = inet_csk(sk); 2427 struct inet_connection_sock *icsk = inet_csk(sk);
2727 2428
2728 if (level != SOL_TCP) 2429 if (level != SOL_TCP)
2729 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 2430 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
@@ -2745,9 +2446,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
2745#endif 2446#endif
2746 2447
2747/* Return information about state of tcp endpoint in API format. */ 2448/* Return information about state of tcp endpoint in API format. */
2748void tcp_get_info(const struct sock *sk, struct tcp_info *info) 2449void tcp_get_info(struct sock *sk, struct tcp_info *info)
2749{ 2450{
2750 const struct tcp_sock *tp = tcp_sk(sk); 2451 struct tcp_sock *tp = tcp_sk(sk);
2751 const struct inet_connection_sock *icsk = inet_csk(sk); 2452 const struct inet_connection_sock *icsk = inet_csk(sk);
2752 u32 now = tcp_time_stamp; 2453 u32 now = tcp_time_stamp;
2753 2454
@@ -2769,12 +2470,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2769 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; 2470 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2770 } 2471 }
2771 2472
2772 if (tp->ecn_flags & TCP_ECN_OK) 2473 if (tp->ecn_flags&TCP_ECN_OK)
2773 info->tcpi_options |= TCPI_OPT_ECN; 2474 info->tcpi_options |= TCPI_OPT_ECN;
2774 if (tp->ecn_flags & TCP_ECN_SEEN)
2775 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2776 if (tp->syn_data_acked)
2777 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2778 2475
2779 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); 2476 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2780 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); 2477 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
@@ -2832,8 +2529,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2832 val = tp->mss_cache; 2529 val = tp->mss_cache;
2833 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2530 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2834 val = tp->rx_opt.user_mss; 2531 val = tp->rx_opt.user_mss;
2835 if (tp->repair)
2836 val = tp->rx_opt.mss_clamp;
2837 break; 2532 break;
2838 case TCP_NODELAY: 2533 case TCP_NODELAY:
2839 val = !!(tp->nonagle&TCP_NAGLE_OFF); 2534 val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2936,26 +2631,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2936 val = tp->thin_dupack; 2631 val = tp->thin_dupack;
2937 break; 2632 break;
2938 2633
2939 case TCP_REPAIR:
2940 val = tp->repair;
2941 break;
2942
2943 case TCP_REPAIR_QUEUE:
2944 if (tp->repair)
2945 val = tp->repair_queue;
2946 else
2947 return -EINVAL;
2948 break;
2949
2950 case TCP_QUEUE_SEQ:
2951 if (tp->repair_queue == TCP_SEND_QUEUE)
2952 val = tp->write_seq;
2953 else if (tp->repair_queue == TCP_RECV_QUEUE)
2954 val = tp->rcv_nxt;
2955 else
2956 return -EINVAL;
2957 break;
2958
2959 case TCP_USER_TIMEOUT: 2634 case TCP_USER_TIMEOUT:
2960 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2635 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2961 break; 2636 break;
@@ -2994,12 +2669,11 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2994EXPORT_SYMBOL(compat_tcp_getsockopt); 2669EXPORT_SYMBOL(compat_tcp_getsockopt);
2995#endif 2670#endif
2996 2671
2997struct sk_buff *tcp_tso_segment(struct sk_buff *skb, 2672struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2998 netdev_features_t features)
2999{ 2673{
3000 struct sk_buff *segs = ERR_PTR(-EINVAL); 2674 struct sk_buff *segs = ERR_PTR(-EINVAL);
3001 struct tcphdr *th; 2675 struct tcphdr *th;
3002 unsigned int thlen; 2676 unsigned thlen;
3003 unsigned int seq; 2677 unsigned int seq;
3004 __be32 delta; 2678 __be32 delta;
3005 unsigned int oldlen; 2679 unsigned int oldlen;
@@ -3198,25 +2872,26 @@ EXPORT_SYMBOL(tcp_gro_complete);
3198 2872
3199#ifdef CONFIG_TCP_MD5SIG 2873#ifdef CONFIG_TCP_MD5SIG
3200static unsigned long tcp_md5sig_users; 2874static unsigned long tcp_md5sig_users;
3201static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; 2875static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
3202static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2876static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3203 2877
3204static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) 2878static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
3205{ 2879{
3206 int cpu; 2880 int cpu;
3207
3208 for_each_possible_cpu(cpu) { 2881 for_each_possible_cpu(cpu) {
3209 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); 2882 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
3210 2883 if (p) {
3211 if (p->md5_desc.tfm) 2884 if (p->md5_desc.tfm)
3212 crypto_free_hash(p->md5_desc.tfm); 2885 crypto_free_hash(p->md5_desc.tfm);
2886 kfree(p);
2887 }
3213 } 2888 }
3214 free_percpu(pool); 2889 free_percpu(pool);
3215} 2890}
3216 2891
3217void tcp_free_md5sig_pool(void) 2892void tcp_free_md5sig_pool(void)
3218{ 2893{
3219 struct tcp_md5sig_pool __percpu *pool = NULL; 2894 struct tcp_md5sig_pool * __percpu *pool = NULL;
3220 2895
3221 spin_lock_bh(&tcp_md5sig_pool_lock); 2896 spin_lock_bh(&tcp_md5sig_pool_lock);
3222 if (--tcp_md5sig_users == 0) { 2897 if (--tcp_md5sig_users == 0) {
@@ -3229,24 +2904,30 @@ void tcp_free_md5sig_pool(void)
3229} 2904}
3230EXPORT_SYMBOL(tcp_free_md5sig_pool); 2905EXPORT_SYMBOL(tcp_free_md5sig_pool);
3231 2906
3232static struct tcp_md5sig_pool __percpu * 2907static struct tcp_md5sig_pool * __percpu *
3233__tcp_alloc_md5sig_pool(struct sock *sk) 2908__tcp_alloc_md5sig_pool(struct sock *sk)
3234{ 2909{
3235 int cpu; 2910 int cpu;
3236 struct tcp_md5sig_pool __percpu *pool; 2911 struct tcp_md5sig_pool * __percpu *pool;
3237 2912
3238 pool = alloc_percpu(struct tcp_md5sig_pool); 2913 pool = alloc_percpu(struct tcp_md5sig_pool *);
3239 if (!pool) 2914 if (!pool)
3240 return NULL; 2915 return NULL;
3241 2916
3242 for_each_possible_cpu(cpu) { 2917 for_each_possible_cpu(cpu) {
2918 struct tcp_md5sig_pool *p;
3243 struct crypto_hash *hash; 2919 struct crypto_hash *hash;
3244 2920
2921 p = kzalloc(sizeof(*p), sk->sk_allocation);
2922 if (!p)
2923 goto out_free;
2924 *per_cpu_ptr(pool, cpu) = p;
2925
3245 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 2926 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3246 if (!hash || IS_ERR(hash)) 2927 if (!hash || IS_ERR(hash))
3247 goto out_free; 2928 goto out_free;
3248 2929
3249 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; 2930 p->md5_desc.tfm = hash;
3250 } 2931 }
3251 return pool; 2932 return pool;
3252out_free: 2933out_free:
@@ -3254,16 +2935,16 @@ out_free:
3254 return NULL; 2935 return NULL;
3255} 2936}
3256 2937
3257struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) 2938struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
3258{ 2939{
3259 struct tcp_md5sig_pool __percpu *pool; 2940 struct tcp_md5sig_pool * __percpu *pool;
3260 bool alloc = false; 2941 int alloc = 0;
3261 2942
3262retry: 2943retry:
3263 spin_lock_bh(&tcp_md5sig_pool_lock); 2944 spin_lock_bh(&tcp_md5sig_pool_lock);
3264 pool = tcp_md5sig_pool; 2945 pool = tcp_md5sig_pool;
3265 if (tcp_md5sig_users++ == 0) { 2946 if (tcp_md5sig_users++ == 0) {
3266 alloc = true; 2947 alloc = 1;
3267 spin_unlock_bh(&tcp_md5sig_pool_lock); 2948 spin_unlock_bh(&tcp_md5sig_pool_lock);
3268 } else if (!pool) { 2949 } else if (!pool) {
3269 tcp_md5sig_users--; 2950 tcp_md5sig_users--;
@@ -3275,7 +2956,7 @@ retry:
3275 2956
3276 if (alloc) { 2957 if (alloc) {
3277 /* we cannot hold spinlock here because this may sleep. */ 2958 /* we cannot hold spinlock here because this may sleep. */
3278 struct tcp_md5sig_pool __percpu *p; 2959 struct tcp_md5sig_pool * __percpu *p;
3279 2960
3280 p = __tcp_alloc_md5sig_pool(sk); 2961 p = __tcp_alloc_md5sig_pool(sk);
3281 spin_lock_bh(&tcp_md5sig_pool_lock); 2962 spin_lock_bh(&tcp_md5sig_pool_lock);
@@ -3308,7 +2989,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3308 */ 2989 */
3309struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) 2990struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3310{ 2991{
3311 struct tcp_md5sig_pool __percpu *p; 2992 struct tcp_md5sig_pool * __percpu *p;
3312 2993
3313 local_bh_disable(); 2994 local_bh_disable();
3314 2995
@@ -3319,7 +3000,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3319 spin_unlock(&tcp_md5sig_pool_lock); 3000 spin_unlock(&tcp_md5sig_pool_lock);
3320 3001
3321 if (p) 3002 if (p)
3322 return this_cpu_ptr(p); 3003 return *this_cpu_ptr(p);
3323 3004
3324 local_bh_enable(); 3005 local_bh_enable();
3325 return NULL; 3006 return NULL;
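The MD5 pool hunks above swap a flat per-CPU allocation of struct tcp_md5sig_pool back to a per-CPU array of pointers, which is why the restored code adds a kzalloc()/kfree() per CPU and an extra dereference in tcp_get_md5sig_pool(). A minimal kernel-style sketch of the two layouts (illustrative only, not taken from the patch; struct item stands in for tcp_md5sig_pool):

struct item { int v; };

static void percpu_layout_demo(void)
{
	int cpu;
	/* (a) flat layout: the struct itself lives in per-CPU storage */
	struct item __percpu *direct = alloc_percpu(struct item);
	/* (b) pointer layout restored by this patch: each CPU slot holds a
	 * pointer that still has to be filled by a separate kzalloc() */
	struct item * __percpu *indirect = alloc_percpu(struct item *);

	if (!direct || !indirect)
		goto out;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(direct, cpu)->v = 1;	/* usable at once */
		*per_cpu_ptr(indirect, cpu) = NULL;	/* not yet allocated */
	}
out:
	free_percpu(direct);
	free_percpu(indirect);
}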
@@ -3334,32 +3015,30 @@ void tcp_put_md5sig_pool(void)
3334EXPORT_SYMBOL(tcp_put_md5sig_pool); 3015EXPORT_SYMBOL(tcp_put_md5sig_pool);
3335 3016
3336int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 3017int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3337 const struct tcphdr *th) 3018 struct tcphdr *th)
3338{ 3019{
3339 struct scatterlist sg; 3020 struct scatterlist sg;
3340 struct tcphdr hdr;
3341 int err; 3021 int err;
3342 3022
3343 /* We are not allowed to change tcphdr, make a local copy */ 3023 __sum16 old_checksum = th->check;
3344 memcpy(&hdr, th, sizeof(hdr)); 3024 th->check = 0;
3345 hdr.check = 0;
3346
3347 /* options aren't included in the hash */ 3025 /* options aren't included in the hash */
3348 sg_init_one(&sg, &hdr, sizeof(hdr)); 3026 sg_init_one(&sg, th, sizeof(struct tcphdr));
3349 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); 3027 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
3028 th->check = old_checksum;
3350 return err; 3029 return err;
3351} 3030}
3352EXPORT_SYMBOL(tcp_md5_hash_header); 3031EXPORT_SYMBOL(tcp_md5_hash_header);
3353 3032
3354int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3033int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3355 const struct sk_buff *skb, unsigned int header_len) 3034 struct sk_buff *skb, unsigned header_len)
3356{ 3035{
3357 struct scatterlist sg; 3036 struct scatterlist sg;
3358 const struct tcphdr *tp = tcp_hdr(skb); 3037 const struct tcphdr *tp = tcp_hdr(skb);
3359 struct hash_desc *desc = &hp->md5_desc; 3038 struct hash_desc *desc = &hp->md5_desc;
3360 unsigned int i; 3039 unsigned i;
3361 const unsigned int head_data_len = skb_headlen(skb) > header_len ? 3040 const unsigned head_data_len = skb_headlen(skb) > header_len ?
3362 skb_headlen(skb) - header_len : 0; 3041 skb_headlen(skb) - header_len : 0;
3363 const struct skb_shared_info *shi = skb_shinfo(skb); 3042 const struct skb_shared_info *shi = skb_shinfo(skb);
3364 struct sk_buff *frag_iter; 3043 struct sk_buff *frag_iter;
3365 3044
@@ -3371,9 +3050,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3371 3050
3372 for (i = 0; i < shi->nr_frags; ++i) { 3051 for (i = 0; i < shi->nr_frags; ++i) {
3373 const struct skb_frag_struct *f = &shi->frags[i]; 3052 const struct skb_frag_struct *f = &shi->frags[i];
3374 struct page *page = skb_frag_page(f); 3053 sg_set_page(&sg, f->page, f->size, f->page_offset);
3375 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); 3054 if (crypto_hash_update(desc, &sg, f->size))
3376 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3377 return 1; 3055 return 1;
3378 } 3056 }
3379 3057
@@ -3385,7 +3063,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3385} 3063}
3386EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3064EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3387 3065
3388int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) 3066int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
3389{ 3067{
3390 struct scatterlist sg; 3068 struct scatterlist sg;
3391 3069
@@ -3396,7 +3074,8 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3396 3074
3397#endif 3075#endif
3398 3076
3399/* Each Responder maintains up to two secret values concurrently for 3077/**
3078 * Each Responder maintains up to two secret values concurrently for
3400 * efficient secret rollover. Each secret value has 4 states: 3079 * efficient secret rollover. Each secret value has 4 states:
3401 * 3080 *
3402 * Generating. (tcp_secret_generating != tcp_secret_primary) 3081 * Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3526,15 +3205,11 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3526 3205
3527void tcp_done(struct sock *sk) 3206void tcp_done(struct sock *sk)
3528{ 3207{
3529 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3530
3531 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3208 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3532 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3209 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3533 3210
3534 tcp_set_state(sk, TCP_CLOSE); 3211 tcp_set_state(sk, TCP_CLOSE);
3535 tcp_clear_xmit_timers(sk); 3212 tcp_clear_xmit_timers(sk);
3536 if (req != NULL)
3537 reqsk_fastopen_remove(sk, req, false);
3538 3213
3539 sk->sk_shutdown = SHUTDOWN_MASK; 3214 sk->sk_shutdown = SHUTDOWN_MASK;
3540 3215
@@ -3550,34 +3225,18 @@ extern struct tcp_congestion_ops tcp_reno;
3550static __initdata unsigned long thash_entries; 3225static __initdata unsigned long thash_entries;
3551static int __init set_thash_entries(char *str) 3226static int __init set_thash_entries(char *str)
3552{ 3227{
3553 ssize_t ret;
3554
3555 if (!str) 3228 if (!str)
3556 return 0; 3229 return 0;
3557 3230 thash_entries = simple_strtoul(str, &str, 0);
3558 ret = kstrtoul(str, 0, &thash_entries);
3559 if (ret)
3560 return 0;
3561
3562 return 1; 3231 return 1;
3563} 3232}
3564__setup("thash_entries=", set_thash_entries); 3233__setup("thash_entries=", set_thash_entries);
3565 3234
3566void tcp_init_mem(struct net *net)
3567{
3568 unsigned long limit = nr_free_buffer_pages() / 8;
3569 limit = max(limit, 128UL);
3570 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3571 net->ipv4.sysctl_tcp_mem[1] = limit;
3572 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3573}
3574
3575void __init tcp_init(void) 3235void __init tcp_init(void)
3576{ 3236{
3577 struct sk_buff *skb = NULL; 3237 struct sk_buff *skb = NULL;
3578 unsigned long limit; 3238 unsigned long limit;
3579 int max_rshare, max_wshare, cnt; 3239 int i, max_share, cnt;
3580 unsigned int i;
3581 unsigned long jiffy = jiffies; 3240 unsigned long jiffy = jiffies;
3582 3241
3583 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3242 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3598,11 +3257,11 @@ void __init tcp_init(void)
3598 alloc_large_system_hash("TCP established", 3257 alloc_large_system_hash("TCP established",
3599 sizeof(struct inet_ehash_bucket), 3258 sizeof(struct inet_ehash_bucket),
3600 thash_entries, 3259 thash_entries,
3601 17, /* one slot per 128 KB of memory */ 3260 (totalram_pages >= 128 * 1024) ?
3261 13 : 15,
3602 0, 3262 0,
3603 NULL, 3263 NULL,
3604 &tcp_hashinfo.ehash_mask, 3264 &tcp_hashinfo.ehash_mask,
3605 0,
3606 thash_entries ? 0 : 512 * 1024); 3265 thash_entries ? 0 : 512 * 1024);
3607 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { 3266 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3608 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3267 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
@@ -3614,13 +3273,13 @@ void __init tcp_init(void)
3614 alloc_large_system_hash("TCP bind", 3273 alloc_large_system_hash("TCP bind",
3615 sizeof(struct inet_bind_hashbucket), 3274 sizeof(struct inet_bind_hashbucket),
3616 tcp_hashinfo.ehash_mask + 1, 3275 tcp_hashinfo.ehash_mask + 1,
3617 17, /* one slot per 128 KB of memory */ 3276 (totalram_pages >= 128 * 1024) ?
3277 13 : 15,
3618 0, 3278 0,
3619 &tcp_hashinfo.bhash_size, 3279 &tcp_hashinfo.bhash_size,
3620 NULL, 3280 NULL,
3621 0,
3622 64 * 1024); 3281 64 * 1024);
3623 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; 3282 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
3624 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 3283 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3625 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 3284 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3626 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3285 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3633,24 +3292,27 @@ void __init tcp_init(void)
3633 sysctl_tcp_max_orphans = cnt / 2; 3292 sysctl_tcp_max_orphans = cnt / 2;
3634 sysctl_max_syn_backlog = max(128, cnt / 256); 3293 sysctl_max_syn_backlog = max(128, cnt / 256);
3635 3294
3636 tcp_init_mem(&init_net); 3295 limit = nr_free_buffer_pages() / 8;
3296 limit = max(limit, 128UL);
3297 sysctl_tcp_mem[0] = limit / 4 * 3;
3298 sysctl_tcp_mem[1] = limit;
3299 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3300
3637 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3301 /* Set per-socket limits to no more than 1/128 the pressure threshold */
3638 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); 3302 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
3639 max_wshare = min(4UL*1024*1024, limit); 3303 max_share = min(4UL*1024*1024, limit);
3640 max_rshare = min(6UL*1024*1024, limit);
3641 3304
3642 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3305 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3643 sysctl_tcp_wmem[1] = 16*1024; 3306 sysctl_tcp_wmem[1] = 16*1024;
3644 sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3307 sysctl_tcp_wmem[2] = max(64*1024, max_share);
3645 3308
3646 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 3309 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3647 sysctl_tcp_rmem[1] = 87380; 3310 sysctl_tcp_rmem[1] = 87380;
3648 sysctl_tcp_rmem[2] = max(87380, max_rshare); 3311 sysctl_tcp_rmem[2] = max(87380, max_share);
3649 3312
3650 pr_info("Hash tables configured (established %u bind %u)\n", 3313 printk(KERN_INFO "TCP: Hash tables configured "
3651 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3314 "(established %u bind %u)\n",
3652 3315 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3653 tcp_metrics_init();
3654 3316
3655 tcp_register_congestion_control(&tcp_reno); 3317 tcp_register_congestion_control(&tcp_reno);
3656 3318
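To put the restored sysctl defaults in concrete terms (illustrative arithmetic, assuming 4 KiB pages): with about 2,097,152 free buffer pages (8 GiB), limit = 262,144 pages, so sysctl_tcp_mem becomes {196608, 262144, 393216} pages. The per-socket bound is then 262,144 << (PAGE_SHIFT - 7) = 8,388,608 bytes, capped by max_share to 4 MiB, so both sysctl_tcp_wmem[2] and sysctl_tcp_rmem[2] end up at 4 MiB. The newer code removed here instead derived the cap directly from nr_free_buffer_pages(), kept separate 4 MiB write and 6 MiB read shares, and initialized tcp_mem per network namespace through tcp_init_mem().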
@@ -3662,5 +3324,108 @@ void __init tcp_init(void)
3662 tcp_secret_primary = &tcp_secret_one; 3324 tcp_secret_primary = &tcp_secret_one;
3663 tcp_secret_retiring = &tcp_secret_two; 3325 tcp_secret_retiring = &tcp_secret_two;
3664 tcp_secret_secondary = &tcp_secret_two; 3326 tcp_secret_secondary = &tcp_secret_two;
3665 tcp_tasklet_init(); 3327}
3328
3329static int tcp_is_local(struct net *net, __be32 addr) {
3330 struct rtable *rt;
3331 struct flowi4 fl4 = { .daddr = addr };
3332 rt = ip_route_output_key(net, &fl4);
3333 if (IS_ERR_OR_NULL(rt))
3334 return 0;
3335 return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK);
3336}
3337
3338#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3339static int tcp_is_local6(struct net *net, struct in6_addr *addr) {
3340 struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0);
3341 return rt6 && rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK);
3342}
3343#endif
3344
3345/*
3346 * tcp_nuke_addr - destroy all sockets on the given local address
3347 * if local address is the unspecified address (0.0.0.0 or ::), destroy all
3348 * sockets with local addresses that are not configured.
3349 */
3350int tcp_nuke_addr(struct net *net, struct sockaddr *addr)
3351{
3352 int family = addr->sa_family;
3353 unsigned int bucket;
3354
3355 struct in_addr *in;
3356#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3357 struct in6_addr *in6;
3358#endif
3359 if (family == AF_INET) {
3360 in = &((struct sockaddr_in *)addr)->sin_addr;
3361#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3362 } else if (family == AF_INET6) {
3363 in6 = &((struct sockaddr_in6 *)addr)->sin6_addr;
3364#endif
3365 } else {
3366 return -EAFNOSUPPORT;
3367 }
3368
3369 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
3370 struct hlist_nulls_node *node;
3371 struct sock *sk;
3372 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
3373
3374restart:
3375 spin_lock_bh(lock);
3376 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
3377 struct inet_sock *inet = inet_sk(sk);
3378
3379 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
3380 continue;
3381 if (sock_flag(sk, SOCK_DEAD))
3382 continue;
3383
3384 if (family == AF_INET) {
3385 __be32 s4 = inet->inet_rcv_saddr;
3386 if (s4 == LOOPBACK4_IPV6)
3387 continue;
3388
3389 if (in->s_addr != s4 &&
3390 !(in->s_addr == INADDR_ANY &&
3391 !tcp_is_local(net, s4)))
3392 continue;
3393 }
3394
3395#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3396 if (family == AF_INET6) {
3397 struct in6_addr *s6;
3398 if (!inet->pinet6)
3399 continue;
3400
3401 s6 = &inet->pinet6->rcv_saddr;
3402 if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED)
3403 continue;
3404
3405 if (!ipv6_addr_equal(in6, s6) &&
3406 !(ipv6_addr_equal(in6, &in6addr_any) &&
3407 !tcp_is_local6(net, s6)))
3408 continue;
3409 }
3410#endif
3411
3412 sock_hold(sk);
3413 spin_unlock_bh(lock);
3414
3415 local_bh_disable();
3416 bh_lock_sock(sk);
3417 sk->sk_err = ETIMEDOUT;
3418 sk->sk_error_report(sk);
3419
3420 tcp_done(sk);
3421 bh_unlock_sock(sk);
3422 local_bh_enable();
3423 sock_put(sk);
3424
3425 goto restart;
3426 }
3427 spin_unlock_bh(lock);
3428 }
3429
3430 return 0;
3666} 3431}
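tcp_nuke_addr(), added by the hunk above, walks the established hash and aborts with ETIMEDOUT every socket bound to the given local address (or, for 0.0.0.0/::, every socket whose local address is no longer configured). A hedged caller sketch only, with an invented wrapper name and a documentation address; on Android kernels this entry point is normally reached through an ioctl rather than called directly:

static int nuke_example(struct net *net)
{
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr   = { .s_addr = htonl(0xC0000201) },	/* 192.0.2.1 */
	};

	return tcp_nuke_addr(net, (struct sockaddr *)&sin);
}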
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c24244..6187eb4d1dc 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -63,6 +63,7 @@ static inline void bictcp_reset(struct bictcp *ca)
63{ 63{
64 ca->cnt = 0; 64 ca->cnt = 0;
65 ca->last_max_cwnd = 0; 65 ca->last_max_cwnd = 0;
66 ca->loss_cwnd = 0;
66 ca->last_cwnd = 0; 67 ca->last_cwnd = 0;
67 ca->last_time = 0; 68 ca->last_time = 0;
68 ca->epoch_start = 0; 69 ca->epoch_start = 0;
@@ -71,11 +72,7 @@ static inline void bictcp_reset(struct bictcp *ca)
71 72
72static void bictcp_init(struct sock *sk) 73static void bictcp_init(struct sock *sk)
73{ 74{
74 struct bictcp *ca = inet_csk_ca(sk); 75 bictcp_reset(inet_csk_ca(sk));
75
76 bictcp_reset(ca);
77 ca->loss_cwnd = 0;
78
79 if (initial_ssthresh) 76 if (initial_ssthresh)
80 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 77 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
81} 78}
@@ -130,7 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
130 } 127 }
131 128
132 /* if in slow start or link utilization is very low */ 129 /* if in slow start or link utilization is very low */
133 if (ca->last_max_cwnd == 0) { 130 if (ca->loss_cwnd == 0) {
134 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
135 ca->cnt = 20; 132 ca->cnt = 20;
136 } 133 }
@@ -188,7 +185,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
188{ 185{
189 const struct tcp_sock *tp = tcp_sk(sk); 186 const struct tcp_sock *tp = tcp_sk(sk);
190 const struct bictcp *ca = inet_csk_ca(sk); 187 const struct bictcp *ca = inet_csk_ca(sk);
191 return max(tp->snd_cwnd, ca->loss_cwnd); 188 return max(tp->snd_cwnd, ca->last_max_cwnd);
192} 189}
193 190
194static void bictcp_state(struct sock *sk, u8 new_state) 191static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 291f2ed7cc3..850c737e08e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,13 +1,11 @@
1/* 1/*
2 * Plugable TCP congestion control support and newReno 2 * Plugable TCP congestion control support and newReno
3 * congestion control. 3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100. 4 * Based on ideas from I/O scheduler suport and Web100.
5 * 5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> 6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */ 7 */
8 8
9#define pr_fmt(fmt) "TCP: " fmt
10
11#include <linux/module.h> 9#include <linux/module.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/types.h> 11#include <linux/types.h>
@@ -43,17 +41,18 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
43 41
44 /* all algorithms must implement ssthresh and cong_avoid ops */ 42 /* all algorithms must implement ssthresh and cong_avoid ops */
45 if (!ca->ssthresh || !ca->cong_avoid) { 43 if (!ca->ssthresh || !ca->cong_avoid) {
46 pr_err("%s does not implement required ops\n", ca->name); 44 printk(KERN_ERR "TCP %s does not implement required ops\n",
45 ca->name);
47 return -EINVAL; 46 return -EINVAL;
48 } 47 }
49 48
50 spin_lock(&tcp_cong_list_lock); 49 spin_lock(&tcp_cong_list_lock);
51 if (tcp_ca_find(ca->name)) { 50 if (tcp_ca_find(ca->name)) {
52 pr_notice("%s already registered\n", ca->name); 51 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
53 ret = -EEXIST; 52 ret = -EEXIST;
54 } else { 53 } else {
55 list_add_tail_rcu(&ca->list, &tcp_cong_list); 54 list_add_tail_rcu(&ca->list, &tcp_cong_list);
56 pr_info("%s registered\n", ca->name); 55 printk(KERN_INFO "TCP %s registered\n", ca->name);
57 } 56 }
58 spin_unlock(&tcp_cong_list_lock); 57 spin_unlock(&tcp_cong_list_lock);
59 58
@@ -259,8 +258,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
259 if (!ca) 258 if (!ca)
260 err = -ENOENT; 259 err = -ENOENT;
261 260
262 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || 261 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
263 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
264 err = -EPERM; 262 err = -EPERM;
265 263
266 else if (!try_module_get(ca->owner)) 264 else if (!try_module_get(ca->owner))
@@ -281,21 +279,20 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
281/* RFC2861 Check whether we are limited by application or congestion window 279/* RFC2861 Check whether we are limited by application or congestion window
282 * This is the inverse of cwnd check in tcp_tso_should_defer 280 * This is the inverse of cwnd check in tcp_tso_should_defer
283 */ 281 */
284bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) 282int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
285{ 283{
286 const struct tcp_sock *tp = tcp_sk(sk); 284 const struct tcp_sock *tp = tcp_sk(sk);
287 u32 left; 285 u32 left;
288 286
289 if (in_flight >= tp->snd_cwnd) 287 if (in_flight >= tp->snd_cwnd)
290 return true; 288 return 1;
291 289
292 left = tp->snd_cwnd - in_flight; 290 left = tp->snd_cwnd - in_flight;
293 if (sk_can_gso(sk) && 291 if (sk_can_gso(sk) &&
294 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && 292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
295 left * tp->mss_cache < sk->sk_gso_max_size && 293 left * tp->mss_cache < sk->sk_gso_max_size)
296 left < sk->sk_gso_max_segs) 294 return 1;
297 return true; 295 return left <= tcp_max_burst(tp);
298 return left <= tcp_max_tso_deferred_mss(tp);
299} 296}
300EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); 297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
301 298
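tcp_is_cwnd_limited(), reverted above from bool back to int and from the gso_max_segs-aware check back to tcp_max_burst(), is the gate a cong_avoid hook consults before growing cwnd. A simplified sketch of that calling pattern, loosely modelled on tcp_reno_cong_avoid() but with a reduced argument list (illustrative only):

static void example_cong_avoid(struct sock *sk, u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Application-limited: growing cwnd would only build a burst. */
	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);			/* exponential growth */
	else
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);	/* additive increase */
}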
@@ -309,7 +306,6 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
309void tcp_slow_start(struct tcp_sock *tp) 306void tcp_slow_start(struct tcp_sock *tp)
310{ 307{
311 int cnt; /* increase in packets */ 308 int cnt; /* increase in packets */
312 unsigned int delta = 0;
313 309
314 /* RFC3465: ABC Slow start 310 /* RFC3465: ABC Slow start
315 * Increase only after a full MSS of bytes is acked 311 * Increase only after a full MSS of bytes is acked
@@ -336,9 +332,9 @@ void tcp_slow_start(struct tcp_sock *tp)
336 tp->snd_cwnd_cnt += cnt; 332 tp->snd_cwnd_cnt += cnt;
337 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 333 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
338 tp->snd_cwnd_cnt -= tp->snd_cwnd; 334 tp->snd_cwnd_cnt -= tp->snd_cwnd;
339 delta++; 335 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
336 tp->snd_cwnd++;
340 } 337 }
341 tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
342} 338}
343EXPORT_SYMBOL_GPL(tcp_slow_start); 339EXPORT_SYMBOL_GPL(tcp_slow_start);
344 340
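A worked example for the two tcp_slow_start() variants above (illustrative numbers): with snd_cwnd = 10 and an ACK that produces cnt = 25, snd_cwnd_cnt reaches 25, the while loop fires twice, and cwnd ends at 12 either way, subject to snd_cwnd_clamp. Only the leftover counter differs: the restored loop subtracts the growing cwnd (25 - 10 - 11, leaving 4), while the newer code subtracts the unchanged cwnd twice (leaving 5) and applies the clamp once at the end.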
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9077f441cb..f376b05cca8 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -107,6 +107,7 @@ static inline void bictcp_reset(struct bictcp *ca)
107{ 107{
108 ca->cnt = 0; 108 ca->cnt = 0;
109 ca->last_max_cwnd = 0; 109 ca->last_max_cwnd = 0;
110 ca->loss_cwnd = 0;
110 ca->last_cwnd = 0; 111 ca->last_cwnd = 0;
111 ca->last_time = 0; 112 ca->last_time = 0;
112 ca->bic_origin_point = 0; 113 ca->bic_origin_point = 0;
@@ -141,10 +142,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
141 142
142static void bictcp_init(struct sock *sk) 143static void bictcp_init(struct sock *sk)
143{ 144{
144 struct bictcp *ca = inet_csk_ca(sk); 145 bictcp_reset(inet_csk_ca(sk));
145
146 bictcp_reset(ca);
147 ca->loss_cwnd = 0;
148 146
149 if (hystart) 147 if (hystart)
150 bictcp_hystart_reset(sk); 148 bictcp_hystart_reset(sk);
@@ -277,7 +275,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
277 * The initial growth of cubic function may be too conservative 275 * The initial growth of cubic function may be too conservative
278 * when the available bandwidth is still unknown. 276 * when the available bandwidth is still unknown.
279 */ 277 */
280 if (ca->last_max_cwnd == 0 && ca->cnt > 20) 278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
281 ca->cnt = 20; /* increase cwnd 5% per RTT */ 279 ca->cnt = 20; /* increase cwnd 5% per RTT */
282 280
283 /* TCP Friendly */ 281 /* TCP Friendly */
@@ -344,7 +342,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
344{ 342{
345 struct bictcp *ca = inet_csk_ca(sk); 343 struct bictcp *ca = inet_csk_ca(sk);
346 344
347 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); 345 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
348} 346}
349 347
350static void bictcp_state(struct sock *sk, u8 new_state) 348static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index ed3f2ad42e0..939edb3b8e4 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,23 +34,11 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
34 tcp_get_info(sk, info); 34 tcp_get_info(sk, info);
35} 35}
36 36
37static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
38 struct inet_diag_req_v2 *r, struct nlattr *bc)
39{
40 inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
41}
42
43static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
44 struct inet_diag_req_v2 *req)
45{
46 return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
47}
48
49static const struct inet_diag_handler tcp_diag_handler = { 37static const struct inet_diag_handler tcp_diag_handler = {
50 .dump = tcp_diag_dump, 38 .idiag_hashinfo = &tcp_hashinfo,
51 .dump_one = tcp_diag_dump_one,
52 .idiag_get_info = tcp_diag_get_info, 39 .idiag_get_info = tcp_diag_get_info,
53 .idiag_type = IPPROTO_TCP, 40 .idiag_type = TCPDIAG_GETSOCK,
41 .idiag_info_size = sizeof(struct tcp_info),
54}; 42};
55 43
56static int __init tcp_diag_init(void) 44static int __init tcp_diag_init(void)
@@ -66,4 +54,4 @@ static void __exit tcp_diag_exit(void)
66module_init(tcp_diag_init); 54module_init(tcp_diag_init);
67module_exit(tcp_diag_exit); 55module_exit(tcp_diag_exit);
68MODULE_LICENSE("GPL"); 56MODULE_LICENSE("GPL");
69MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); 57MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
deleted file mode 100644
index 8f7ef0ad80e..00000000000
--- a/net/ipv4/tcp_fastopen.c
+++ /dev/null
@@ -1,92 +0,0 @@
1#include <linux/err.h>
2#include <linux/init.h>
3#include <linux/kernel.h>
4#include <linux/list.h>
5#include <linux/tcp.h>
6#include <linux/rcupdate.h>
7#include <linux/rculist.h>
8#include <net/inetpeer.h>
9#include <net/tcp.h>
10
11int sysctl_tcp_fastopen __read_mostly;
12
13struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
14
15static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
16
17static void tcp_fastopen_ctx_free(struct rcu_head *head)
18{
19 struct tcp_fastopen_context *ctx =
20 container_of(head, struct tcp_fastopen_context, rcu);
21 crypto_free_cipher(ctx->tfm);
22 kfree(ctx);
23}
24
25int tcp_fastopen_reset_cipher(void *key, unsigned int len)
26{
27 int err;
28 struct tcp_fastopen_context *ctx, *octx;
29
30 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
31 if (!ctx)
32 return -ENOMEM;
33 ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
34
35 if (IS_ERR(ctx->tfm)) {
36 err = PTR_ERR(ctx->tfm);
37error: kfree(ctx);
38 pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
39 return err;
40 }
41 err = crypto_cipher_setkey(ctx->tfm, key, len);
42 if (err) {
43 pr_err("TCP: TFO cipher key error: %d\n", err);
44 crypto_free_cipher(ctx->tfm);
45 goto error;
46 }
47 memcpy(ctx->key, key, len);
48
49 spin_lock(&tcp_fastopen_ctx_lock);
50
51 octx = rcu_dereference_protected(tcp_fastopen_ctx,
52 lockdep_is_held(&tcp_fastopen_ctx_lock));
53 rcu_assign_pointer(tcp_fastopen_ctx, ctx);
54 spin_unlock(&tcp_fastopen_ctx_lock);
55
56 if (octx)
57 call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
58 return err;
59}
60
61/* Computes the fastopen cookie for the peer.
62 * The peer address is a 128 bits long (pad with zeros for IPv4).
63 *
64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully.
66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
68{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 };
70 struct tcp_fastopen_context *ctx;
71
72 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm,
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 }
80 rcu_read_unlock();
81}
82
83static int __init tcp_fastopen_init(void)
84{
85 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
86
87 get_random_bytes(key, sizeof(key));
88 tcp_fastopen_reset_cipher(key, sizeof(key));
89 return 0;
90}
91
92late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4..fe3ecf484b4 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,7 +15,7 @@
15 15
16/* Tcp Hybla structure. */ 16/* Tcp Hybla structure. */
17struct hybla { 17struct hybla {
18 bool hybla_en; 18 u8 hybla_en;
19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ 19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
20 u32 rho; /* Rho parameter, integer part */ 20 u32 rho; /* Rho parameter, integer part */
21 u32 rho2; /* Rho * Rho, integer part */ 21 u32 rho2; /* Rho * Rho, integer part */
@@ -24,7 +24,8 @@ struct hybla {
24 u32 minrtt; /* Minimum smoothed round trip time value seen */ 24 u32 minrtt; /* Minimum smoothed round trip time value seen */
25}; 25};
26 26
27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ 27/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
28 expressed in jiffies */
28static int rtt0 = 25; 29static int rtt0 = 25;
29module_param(rtt0, int, 0644); 30module_param(rtt0, int, 0644);
30MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); 31MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
@@ -38,7 +39,7 @@ static inline void hybla_recalc_param (struct sock *sk)
38 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); 39 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
39 ca->rho = ca->rho_3ls >> 3; 40 ca->rho = ca->rho_3ls >> 3;
40 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 41 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
41 ca->rho2 = ca->rho2_7ls >> 7; 42 ca->rho2 = ca->rho2_7ls >>7;
42} 43}
43 44
44static void hybla_init(struct sock *sk) 45static void hybla_init(struct sock *sk)
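For scale (illustrative arithmetic, assuming HZ = 1000 and the default rtt0 = 25 ms): a smoothed RTT of 200 ms means tp->srtt = 1600, since srtt is kept in jiffies << 3, so rho_3ls = 1600 / 25 = 64, rho = 8, and rho2 = ((64 * 64) << 1) >> 7 = 64 = rho^2. The max_t(..., 8) floor keeps rho >= 1 on paths faster than the reference RTT.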
@@ -51,7 +52,7 @@ static void hybla_init(struct sock *sk)
51 ca->rho_3ls = 0; 52 ca->rho_3ls = 0;
52 ca->rho2_7ls = 0; 53 ca->rho2_7ls = 0;
53 ca->snd_cwnd_cents = 0; 54 ca->snd_cwnd_cents = 0;
54 ca->hybla_en = true; 55 ca->hybla_en = 1;
55 tp->snd_cwnd = 2; 56 tp->snd_cwnd = 2;
56 tp->snd_cwnd_clamp = 65535; 57 tp->snd_cwnd_clamp = 65535;
57 58
@@ -66,7 +67,6 @@ static void hybla_init(struct sock *sk)
66static void hybla_state(struct sock *sk, u8 ca_state) 67static void hybla_state(struct sock *sk, u8 ca_state)
67{ 68{
68 struct hybla *ca = inet_csk_ca(sk); 69 struct hybla *ca = inet_csk_ca(sk);
69
70 ca->hybla_en = (ca_state == TCP_CA_Open); 70 ca->hybla_en = (ca_state == TCP_CA_Open);
71} 71}
72 72
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c87..813b43a76fe 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -313,13 +313,11 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
313 .tcpv_rttcnt = ca->cnt_rtt, 313 .tcpv_rttcnt = ca->cnt_rtt,
314 .tcpv_minrtt = ca->base_rtt, 314 .tcpv_minrtt = ca->base_rtt,
315 }; 315 };
316 u64 t = ca->sum_rtt;
316 317
317 if (info.tcpv_rttcnt > 0) { 318 do_div(t, ca->cnt_rtt);
318 u64 t = ca->sum_rtt; 319 info.tcpv_rtt = t;
319 320
320 do_div(t, info.tcpv_rttcnt);
321 info.tcpv_rtt = t;
322 }
323 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 321 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
324 } 322 }
325} 323}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 18f97ca76b0..d73aab3fbfc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,8 +61,6 @@
61 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 61 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
62 */ 62 */
63 63
64#define pr_fmt(fmt) "TCP: " fmt
65
66#include <linux/mm.h> 64#include <linux/mm.h>
67#include <linux/slab.h> 65#include <linux/slab.h>
68#include <linux/module.h> 66#include <linux/module.h>
@@ -85,23 +83,20 @@ int sysctl_tcp_ecn __read_mostly = 2;
85EXPORT_SYMBOL(sysctl_tcp_ecn); 83EXPORT_SYMBOL(sysctl_tcp_ecn);
86int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
87int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
88int sysctl_tcp_adv_win_scale __read_mostly = 1; 86int sysctl_tcp_adv_win_scale __read_mostly = 2;
89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
90 88
91/* rfc5961 challenge ack rate limiting */
92int sysctl_tcp_challenge_ack_limit = 100;
93
94int sysctl_tcp_stdurg __read_mostly; 89int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 90int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 91int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 92int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_frto_response __read_mostly; 93int sysctl_tcp_frto_response __read_mostly;
94int sysctl_tcp_nometrics_save __read_mostly;
99 95
100int sysctl_tcp_thin_dupack __read_mostly; 96int sysctl_tcp_thin_dupack __read_mostly;
101 97
102int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
103int sysctl_tcp_abc __read_mostly; 99int sysctl_tcp_abc __read_mostly;
104int sysctl_tcp_early_retrans __read_mostly = 2;
105 100
106#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 101#define FLAG_DATA 0x01 /* Incoming frame contained data. */
107#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 102#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -110,6 +105,7 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
110#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ 105#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
111#define FLAG_DATA_SACKED 0x20 /* New SACK. */ 106#define FLAG_DATA_SACKED 0x20 /* New SACK. */
112#define FLAG_ECE 0x40 /* ECE in this ACK */ 107#define FLAG_ECE 0x40 /* ECE in this ACK */
108#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
113#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 109#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
114#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ 110#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
115#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 111#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -178,7 +174,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
178static void tcp_incr_quickack(struct sock *sk) 174static void tcp_incr_quickack(struct sock *sk)
179{ 175{
180 struct inet_connection_sock *icsk = inet_csk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
181 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); 177 unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
182 178
183 if (quickacks == 0) 179 if (quickacks == 0)
184 quickacks = 2; 180 quickacks = 2;
@@ -198,10 +194,9 @@ static void tcp_enter_quickack_mode(struct sock *sk)
198 * and the session is not interactive. 194 * and the session is not interactive.
199 */ 195 */
200 196
201static inline bool tcp_in_quickack_mode(const struct sock *sk) 197static inline int tcp_in_quickack_mode(const struct sock *sk)
202{ 198{
203 const struct inet_connection_sock *icsk = inet_csk(sk); 199 const struct inet_connection_sock *icsk = inet_csk(sk);
204
205 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; 200 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
206} 201}
207 202
@@ -211,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
211 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 206 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
212} 207}
213 208
214static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) 209static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
215{ 210{
216 if (tcp_hdr(skb)->cwr) 211 if (tcp_hdr(skb)->cwr)
217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 212 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
@@ -222,49 +217,36 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
222 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
223} 218}
224 219
225static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) 220static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
226{ 221{
227 if (!(tp->ecn_flags & TCP_ECN_OK)) 222 if (tp->ecn_flags & TCP_ECN_OK) {
228 return; 223 if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
229 224 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
230 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
231 case INET_ECN_NOT_ECT:
232 /* Funny extension: if ECT is not set on a segment, 225 /* Funny extension: if ECT is not set on a segment,
233 * and we already seen ECT on a previous segment, 226 * it is surely retransmit. It is not in ECN RFC,
234 * it is probably a retransmit. 227 * but Linux follows this rule. */
235 */ 228 else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
236 if (tp->ecn_flags & TCP_ECN_SEEN)
237 tcp_enter_quickack_mode((struct sock *)tp);
238 break;
239 case INET_ECN_CE:
240 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
241 /* Better not delay acks, sender can have a very low cwnd */
242 tcp_enter_quickack_mode((struct sock *)tp); 229 tcp_enter_quickack_mode((struct sock *)tp);
243 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
244 }
245 /* fallinto */
246 default:
247 tp->ecn_flags |= TCP_ECN_SEEN;
248 } 230 }
249} 231}
250 232
251static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) 233static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
252{ 234{
253 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) 235 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
254 tp->ecn_flags &= ~TCP_ECN_OK; 236 tp->ecn_flags &= ~TCP_ECN_OK;
255} 237}
256 238
257static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) 239static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
258{ 240{
259 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) 241 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
260 tp->ecn_flags &= ~TCP_ECN_OK; 242 tp->ecn_flags &= ~TCP_ECN_OK;
261} 243}
262 244
263static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) 245static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
264{ 246{
265 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) 247 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
266 return true; 248 return 1;
267 return false; 249 return 0;
268} 250}
269 251
270/* Buffer size and advertised window tuning. 252/* Buffer size and advertised window tuning.
@@ -274,11 +256,14 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
274 256
275static void tcp_fixup_sndbuf(struct sock *sk) 257static void tcp_fixup_sndbuf(struct sock *sk)
276{ 258{
277 int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
260 sizeof(struct sk_buff);
278 261
279 sndmem *= TCP_INIT_CWND; 262 if (sk->sk_sndbuf < 3 * sndmem) {
280 if (sk->sk_sndbuf < sndmem) 263 sk->sk_sndbuf = 3 * sndmem;
281 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); 264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
282} 267}
283 268
284/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -324,14 +309,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
324 return 0; 309 return 0;
325} 310}
326 311
327static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) 312static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
328{ 313{
329 struct tcp_sock *tp = tcp_sk(sk); 314 struct tcp_sock *tp = tcp_sk(sk);
330 315
331 /* Check #1 */ 316 /* Check #1 */
332 if (tp->rcv_ssthresh < tp->window_clamp && 317 if (tp->rcv_ssthresh < tp->window_clamp &&
333 (int)tp->rcv_ssthresh < tcp_space(sk) && 318 (int)tp->rcv_ssthresh < tcp_space(sk) &&
334 !sk_under_memory_pressure(sk)) { 319 !tcp_memory_pressure) {
335 int incr; 320 int incr;
336 321
337 /* Check #2. Increase window, if skb with such overhead 322 /* Check #2. Increase window, if skb with such overhead
@@ -343,7 +328,6 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
343 incr = __tcp_grow_window(sk, skb); 328 incr = __tcp_grow_window(sk, skb);
344 329
345 if (incr) { 330 if (incr) {
346 incr = max_t(int, incr, 2 * skb->len);
347 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, 331 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
348 tp->window_clamp); 332 tp->window_clamp);
349 inet_csk(sk)->icsk_ack.quick |= 1; 333 inet_csk(sk)->icsk_ack.quick |= 1;
@@ -355,30 +339,23 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
355 339
356static void tcp_fixup_rcvbuf(struct sock *sk) 340static void tcp_fixup_rcvbuf(struct sock *sk)
357{ 341{
358 u32 mss = tcp_sk(sk)->advmss; 342 struct tcp_sock *tp = tcp_sk(sk);
359 u32 icwnd = TCP_DEFAULT_INIT_RCVWND; 343 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
360 int rcvmem;
361 344
362 /* Limit to 10 segments if mss <= 1460, 345 /* Try to select rcvbuf so that 4 mss-sized segments
363 * or 14600/mss segments, with a minimum of two segments. 346 * will fit to window and corresponding skbs will fit to our rcvbuf.
347 * (was 3; 4 is minimum to allow fast retransmit to work.)
364 */ 348 */
365 if (mss > 1460) 349 while (tcp_win_from_space(rcvmem) < tp->advmss)
366 icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
367
368 rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
369 while (tcp_win_from_space(rcvmem) < mss)
370 rcvmem += 128; 350 rcvmem += 128;
371 351 if (sk->sk_rcvbuf < 4 * rcvmem)
372 rcvmem *= icwnd; 352 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
373
374 if (sk->sk_rcvbuf < rcvmem)
375 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
376} 353}
377 354
378/* 4. Try to fixup all. It is made immediately after connection enters 355/* 4. Try to fixup all. It is made immediately after connection enters
379 * established state. 356 * established state.
380 */ 357 */
381void tcp_init_buffer_space(struct sock *sk) 358static void tcp_init_buffer_space(struct sock *sk)
382{ 359{
383 struct tcp_sock *tp = tcp_sk(sk); 360 struct tcp_sock *tp = tcp_sk(sk);
384 int maxwin; 361 int maxwin;
@@ -421,8 +398,8 @@ static void tcp_clamp_window(struct sock *sk)
421 398
422 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
423 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
424 !sk_under_memory_pressure(sk) && 401 !tcp_memory_pressure &&
425 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
426 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
427 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
428 } 405 }
@@ -439,7 +416,7 @@ static void tcp_clamp_window(struct sock *sk)
439 */ 416 */
440void tcp_initialize_rcv_mss(struct sock *sk) 417void tcp_initialize_rcv_mss(struct sock *sk)
441{ 418{
442 const struct tcp_sock *tp = tcp_sk(sk); 419 struct tcp_sock *tp = tcp_sk(sk);
443 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 420 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
444 421
445 hint = min(hint, tp->rcv_wnd / 2); 422 hint = min(hint, tp->rcv_wnd / 2);
@@ -483,11 +460,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
483 if (!win_dep) { 460 if (!win_dep) {
484 m -= (new_sample >> 3); 461 m -= (new_sample >> 3);
485 new_sample += m; 462 new_sample += m;
486 } else { 463 } else if (m < new_sample)
487 m <<= 3; 464 new_sample = m << 3;
488 if (m < new_sample)
489 new_sample = m;
490 }
491 } else { 465 } else {
492 /* No previous measure. */ 466 /* No previous measure. */
493 new_sample = m << 3; 467 new_sample = m << 3;
@@ -503,7 +477,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
503 goto new_measure; 477 goto new_measure;
504 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 478 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
505 return; 479 return;
506 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); 480 tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
507 481
508new_measure: 482new_measure:
509 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 483 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
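The receiver-side estimator above stores rcv_rtt_est.rtt in 1/8-jiffy fixed point (the stored value is 8 * RTT). The !win_dep branch is the usual 1/8-gain EWMA, est <- est + (m - est) / 8: an estimate of 100 jiffies with a new sample m = 140 becomes 105. In the win_dep branch the two sides differ in where the << 3 scaling happens: the newer code scales m before comparing, while the restored line compares the unscaled sample against the scaled estimate and scales it only on assignment, so the two are not equivalent.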
@@ -557,7 +531,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
557 space /= tp->advmss; 531 space /= tp->advmss;
558 if (!space) 532 if (!space)
559 space = 1; 533 space = 1;
560 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 534 rcvmem = (tp->advmss + MAX_TCP_HEADER +
535 16 + sizeof(struct sk_buff));
561 while (tcp_win_from_space(rcvmem) < tp->advmss) 536 while (tcp_win_from_space(rcvmem) < tp->advmss)
562 rcvmem += 128; 537 rcvmem += 128;
563 space *= rcvmem; 538 space *= rcvmem;
@@ -707,7 +682,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
707/* Calculate rto without backoff. This is the second half of Van Jacobson's 682/* Calculate rto without backoff. This is the second half of Van Jacobson's
708 * routine referred to above. 683 * routine referred to above.
709 */ 684 */
710void tcp_set_rto(struct sock *sk) 685static inline void tcp_set_rto(struct sock *sk)
711{ 686{
712 const struct tcp_sock *tp = tcp_sk(sk); 687 const struct tcp_sock *tp = tcp_sk(sk);
713 /* Old crap is replaced with new one. 8) 688 /* Old crap is replaced with new one. 8)
@@ -734,7 +709,110 @@ void tcp_set_rto(struct sock *sk)
734 tcp_bound_rto(sk); 709 tcp_bound_rto(sk);
735} 710}
736 711
737__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 712/* Save metrics learned by this TCP session.
713 This function is called only, when TCP finishes successfully
714 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
715 */
716void tcp_update_metrics(struct sock *sk)
717{
718 struct tcp_sock *tp = tcp_sk(sk);
719 struct dst_entry *dst = __sk_dst_get(sk);
720
721 if (sysctl_tcp_nometrics_save)
722 return;
723
724 dst_confirm(dst);
725
726 if (dst && (dst->flags & DST_HOST)) {
727 const struct inet_connection_sock *icsk = inet_csk(sk);
728 int m;
729 unsigned long rtt;
730
731 if (icsk->icsk_backoff || !tp->srtt) {
732 /* This session failed to estimate rtt. Why?
733 * Probably, no packets returned in time.
734 * Reset our results.
735 */
736 if (!(dst_metric_locked(dst, RTAX_RTT)))
737 dst_metric_set(dst, RTAX_RTT, 0);
738 return;
739 }
740
741 rtt = dst_metric_rtt(dst, RTAX_RTT);
742 m = rtt - tp->srtt;
743
744 /* If newly calculated rtt larger than stored one,
745 * store new one. Otherwise, use EWMA. Remember,
746 * rtt overestimation is always better than underestimation.
747 */
748 if (!(dst_metric_locked(dst, RTAX_RTT))) {
749 if (m <= 0)
750 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
751 else
752 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
753 }
754
755 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
756 unsigned long var;
757 if (m < 0)
758 m = -m;
759
760 /* Scale deviation to rttvar fixed point */
761 m >>= 1;
762 if (m < tp->mdev)
763 m = tp->mdev;
764
765 var = dst_metric_rtt(dst, RTAX_RTTVAR);
766 if (m >= var)
767 var = m;
768 else
769 var -= (var - m) >> 2;
770
771 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
772 }
773
774 if (tcp_in_initial_slowstart(tp)) {
775 /* Slow start still did not finish. */
776 if (dst_metric(dst, RTAX_SSTHRESH) &&
777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
780 if (!dst_metric_locked(dst, RTAX_CWND) &&
781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
784 icsk->icsk_ca_state == TCP_CA_Open) {
785 /* Cong. avoidance phase, cwnd is reliable. */
786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
787 dst_metric_set(dst, RTAX_SSTHRESH,
788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
789 if (!dst_metric_locked(dst, RTAX_CWND))
790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
793 } else {
794 /* Else slow start did not finish, cwnd is non-sense,
795 ssthresh may be also invalid.
796 */
797 if (!dst_metric_locked(dst, RTAX_CWND))
798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
801 if (dst_metric(dst, RTAX_SSTHRESH) &&
802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
805 }
806
807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
809 tp->reordering != sysctl_tcp_reordering)
810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
811 }
812 }
813}
814
815__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
738{ 816{
739 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
740 818
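In tcp_update_metrics() above, the cached RTAX_RTT is pulled toward the session's srtt with a 1/8 gain only when it is an overestimate (m > 0: rtt <- rtt - (rtt - srtt) / 8) and is replaced outright when it was an underestimate (m <= 0), matching the comment that overestimation is preferred. RTAX_RTTVAR works the same way with a 1/4 gain: the deviation (|m| halved, floored at tp->mdev) replaces var immediately when larger, otherwise var decays toward it by (var - m) / 4. For example, a cached RTT of 200 with srtt = 160 becomes 195; with srtt = 240 it jumps straight to 240.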
@@ -743,22 +821,124 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
743 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
744} 822}
745 823
824/* Set slow start threshold and cwnd not falling to slow start */
825void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
826{
827 struct tcp_sock *tp = tcp_sk(sk);
828 const struct inet_connection_sock *icsk = inet_csk(sk);
829
830 tp->prior_ssthresh = 0;
831 tp->bytes_acked = 0;
832 if (icsk->icsk_ca_state < TCP_CA_CWR) {
833 tp->undo_marker = 0;
834 if (set_ssthresh)
835 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
836 tp->snd_cwnd = min(tp->snd_cwnd,
837 tcp_packets_in_flight(tp) + 1U);
838 tp->snd_cwnd_cnt = 0;
839 tp->high_seq = tp->snd_nxt;
840 tp->snd_cwnd_stamp = tcp_time_stamp;
841 TCP_ECN_queue_cwr(tp);
842
843 tcp_set_ca_state(sk, TCP_CA_CWR);
844 }
845}
846
746/* 847/*
747 * Packet counting of FACK is based on in-order assumptions, therefore TCP 848 * Packet counting of FACK is based on in-order assumptions, therefore TCP
748 * disables it when reordering is detected 849 * disables it when reordering is detected
749 */ 850 */
750void tcp_disable_fack(struct tcp_sock *tp) 851static void tcp_disable_fack(struct tcp_sock *tp)
751{ 852{
752 /* RFC3517 uses different metric in lost marker => reset on change */ 853 /* RFC3517 uses different metric in lost marker => reset on change */
753 if (tcp_is_fack(tp)) 854 if (tcp_is_fack(tp))
754 tp->lost_skb_hint = NULL; 855 tp->lost_skb_hint = NULL;
755 tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; 856 tp->rx_opt.sack_ok &= ~2;
756} 857}
757 858
758/* Take a notice that peer is sending D-SACKs */ 859/* Take a notice that peer is sending D-SACKs */
759static void tcp_dsack_seen(struct tcp_sock *tp) 860static void tcp_dsack_seen(struct tcp_sock *tp)
760{ 861{
761 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 862 tp->rx_opt.sack_ok |= 4;
863}
864
865/* Initialize metrics on socket. */
866
867static void tcp_init_metrics(struct sock *sk)
868{
869 struct tcp_sock *tp = tcp_sk(sk);
870 struct dst_entry *dst = __sk_dst_get(sk);
871
872 if (dst == NULL)
873 goto reset;
874
875 dst_confirm(dst);
876
877 if (dst_metric_locked(dst, RTAX_CWND))
878 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
879 if (dst_metric(dst, RTAX_SSTHRESH)) {
880 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
881 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
882 tp->snd_ssthresh = tp->snd_cwnd_clamp;
883 } else {
884 /* ssthresh may have been reduced unnecessarily during.
885 * 3WHS. Restore it back to its initial default.
886 */
887 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
888 }
889 if (dst_metric(dst, RTAX_REORDERING) &&
890 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
891 tcp_disable_fack(tp);
892 tp->reordering = dst_metric(dst, RTAX_REORDERING);
893 }
894
895 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
896 goto reset;
897
898 /* Initial rtt is determined from SYN,SYN-ACK.
899 * The segment is small and rtt may appear much
900 * less than real one. Use per-dst memory
901 * to make it more realistic.
902 *
903 * A bit of theory. RTT is time passed after "normal" sized packet
904 * is sent until it is ACKed. In normal circumstances sending small
905 * packets force peer to delay ACKs and calculation is correct too.
906 * The algorithm is adaptive and, provided we follow specs, it
907 * NEVER underestimate RTT. BUT! If peer tries to make some clever
908 * tricks sort of "quick acks" for time long enough to decrease RTT
909 * to low value, and then abruptly stops to do it and starts to delay
910 * ACKs, wait for troubles.
911 */
912 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
913 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
914 tp->rtt_seq = tp->snd_nxt;
915 }
916 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
917 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
918 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
919 }
920 tcp_set_rto(sk);
921reset:
922 if (tp->srtt == 0) {
923 /* RFC2988bis: We've failed to get a valid RTT sample from
924 * 3WHS. This is most likely due to retransmission,
925 * including spurious one. Reset the RTO back to 3secs
926 * from the more aggressive 1sec to avoid more spurious
927 * retransmission.
928 */
929 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
930 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
931 }
932 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
933 * retransmitted. In light of RFC2988bis' more aggressive 1sec
934 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
935 * retransmission has occurred.
936 */
937 if (tp->total_retrans > 1)
938 tp->snd_cwnd = 1;
939 else
940 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
941 tp->snd_cwnd_stamp = tcp_time_stamp;
762} 942}
763 943
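
As a side note on the seeding above: once srtt and the variance have been taken from the cached route metrics, the retransmit timer follows the usual RFC 6298 shape. Below is a minimal standalone sketch of that arithmetic in plain milliseconds; seed_rto_ms and the two clamp constants are illustrative names, not kernel symbols, and the kernel itself keeps these values in jiffies and fixed point.

/* Illustrative sketch only: seed srtt/rttvar from cached per-destination
 * metrics and derive an RFC 6298 style retransmit timeout. */
#include <stdint.h>

#define SKETCH_RTO_MIN_MS   200u
#define SKETCH_RTO_MAX_MS   120000u

static uint32_t seed_rto_ms(uint32_t cached_rtt, uint32_t cached_rttvar,
                            uint32_t *srtt, uint32_t *rttvar)
{
        uint32_t rto;

        /* Prefer the cached values when they exceed what the 3WHS measured,
         * mirroring the comparisons in tcp_init_metrics() above. */
        if (cached_rtt > *srtt)
                *srtt = cached_rtt;
        if (cached_rttvar > *rttvar)
                *rttvar = cached_rttvar;

        rto = *srtt + 4 * *rttvar;              /* srtt + 4 * rttvar */
        if (rto < SKETCH_RTO_MIN_MS)
                rto = SKETCH_RTO_MIN_MS;
        if (rto > SKETCH_RTO_MAX_MS)
                rto = SKETCH_RTO_MAX_MS;
        return rto;
}

For example, with srtt seeded to 80 ms and rttvar to 20 ms the sketch yields 80 + 4*20 = 160 ms, which the lower clamp then raises to 200 ms.
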
764static void tcp_update_reordering(struct sock *sk, const int metric, 944static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -782,18 +962,15 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
782 962
783 NET_INC_STATS_BH(sock_net(sk), mib_idx); 963 NET_INC_STATS_BH(sock_net(sk), mib_idx);
784#if FASTRETRANS_DEBUG > 1 964#if FASTRETRANS_DEBUG > 1
785 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 965 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
786 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 966 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
787 tp->reordering, 967 tp->reordering,
788 tp->fackets_out, 968 tp->fackets_out,
789 tp->sacked_out, 969 tp->sacked_out,
790 tp->undo_marker ? tp->undo_retrans : 0); 970 tp->undo_marker ? tp->undo_retrans : 0);
791#endif 971#endif
792 tcp_disable_fack(tp); 972 tcp_disable_fack(tp);
793 } 973 }
794
795 if (metric > 0)
796 tcp_disable_early_retrans(tp);
797} 974}
798 975
799/* This must be called before lost_out is incremented */ 976/* This must be called before lost_out is incremented */
@@ -851,11 +1028,13 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
851 * These 6 states form finite state machine, controlled by the following events: 1028 * These 6 states form finite state machine, controlled by the following events:
852 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) 1029 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
853 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) 1030 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
854 * 3. Loss detection event of two flavors: 1031 * 3. Loss detection event of one of three flavors:
855 * A. Scoreboard estimator decided the packet is lost. 1032 * A. Scoreboard estimator decided the packet is lost.
856 * A'. Reno "three dupacks" marks head of queue lost. 1033 * A'. Reno "three dupacks" marks head of queue lost.
 857 * A''. Its FACK modification, head until snd.fack is lost. 1034 * A''. Its FACK modification, head until snd.fack is lost.
858 * B. SACK arrives sacking SND.NXT at the moment, when the 1035 * B. SACK arrives sacking data transmitted after never retransmitted
1036 * hole was sent out.
1037 * C. SACK arrives sacking SND.NXT at the moment, when the
859 * segment was retransmitted. 1038 * segment was retransmitted.
860 * 4. D-SACK added new rule: D-SACK changes any tag to S. 1039 * 4. D-SACK added new rule: D-SACK changes any tag to S.
861 * 1040 *
@@ -924,36 +1103,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
924 * the exact amount is rather hard to quantify. However, tp->max_window can 1103 * the exact amount is rather hard to quantify. However, tp->max_window can
925 * be used as an exaggerated estimate. 1104 * be used as an exaggerated estimate.
926 */ 1105 */
927static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, 1106static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
928 u32 start_seq, u32 end_seq) 1107 u32 start_seq, u32 end_seq)
929{ 1108{
930 /* Too far in future, or reversed (interpretation is ambiguous) */ 1109 /* Too far in future, or reversed (interpretation is ambiguous) */
931 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) 1110 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
932 return false; 1111 return 0;
933 1112
934 /* Nasty start_seq wrap-around check (see comments above) */ 1113 /* Nasty start_seq wrap-around check (see comments above) */
935 if (!before(start_seq, tp->snd_nxt)) 1114 if (!before(start_seq, tp->snd_nxt))
936 return false; 1115 return 0;
937 1116
938 /* In outstanding window? ...This is valid exit for D-SACKs too. 1117 /* In outstanding window? ...This is valid exit for D-SACKs too.
939 * start_seq == snd_una is non-sensical (see comments above) 1118 * start_seq == snd_una is non-sensical (see comments above)
940 */ 1119 */
941 if (after(start_seq, tp->snd_una)) 1120 if (after(start_seq, tp->snd_una))
942 return true; 1121 return 1;
943 1122
944 if (!is_dsack || !tp->undo_marker) 1123 if (!is_dsack || !tp->undo_marker)
945 return false; 1124 return 0;
946 1125
947 /* ...Then it's D-SACK, and must reside below snd_una completely */ 1126 /* ...Then it's D-SACK, and must reside below snd_una completely */
948 if (after(end_seq, tp->snd_una)) 1127 if (after(end_seq, tp->snd_una))
949 return false; 1128 return 0;
950 1129
951 if (!before(start_seq, tp->undo_marker)) 1130 if (!before(start_seq, tp->undo_marker))
952 return true; 1131 return 1;
953 1132
954 /* Too old */ 1133 /* Too old */
955 if (!after(end_seq, tp->undo_marker)) 1134 if (!after(end_seq, tp->undo_marker))
956 return false; 1135 return 0;
957 1136
958 /* Undo_marker boundary crossing (overestimates a lot). Known already: 1137 /* Undo_marker boundary crossing (overestimates a lot). Known already:
959 * start_seq < undo_marker and end_seq >= undo_marker. 1138 * start_seq < undo_marker and end_seq >= undo_marker.
@@ -962,7 +1141,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
962} 1141}
963 1142
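
All of the checks in tcp_is_sackblock_valid() rely on the wrap-safe sequence comparisons before() and after() from include/net/tcp.h. For reference, they are equivalent to the following standalone helpers, renamed seq_before/seq_after here only to avoid clashing with the kernel macros:

/* Wrap-safe 32-bit sequence comparisons: the signed subtraction keeps the
 * test correct across the 2^32 wrap, which is why start_seq/end_seq can be
 * compared against snd_una and snd_nxt directly above. */
#include <stdint.h>
#include <stdbool.h>

static inline bool seq_before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;
}

static inline bool seq_after(uint32_t seq2, uint32_t seq1)
{
        return seq_before(seq1, seq2);
}
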
964/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". 1143/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
965 * Event "B". Later note: FACK people cheated me again 8), we have to account 1144 * Event "C". Later note: FACK people cheated me again 8), we have to account
966 * for reordering! Ugly, but should help. 1145 * for reordering! Ugly, but should help.
967 * 1146 *
968 * Search retransmitted skbs from write_queue that were sent when snd_nxt was 1147 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1025,17 +1204,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1025 tp->lost_retrans_low = new_low_seq; 1204 tp->lost_retrans_low = new_low_seq;
1026} 1205}
1027 1206
1028static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, 1207static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1029 struct tcp_sack_block_wire *sp, int num_sacks, 1208 struct tcp_sack_block_wire *sp, int num_sacks,
1030 u32 prior_snd_una) 1209 u32 prior_snd_una)
1031{ 1210{
1032 struct tcp_sock *tp = tcp_sk(sk); 1211 struct tcp_sock *tp = tcp_sk(sk);
1033 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); 1212 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1034 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); 1213 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1035 bool dup_sack = false; 1214 int dup_sack = 0;
1036 1215
1037 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { 1216 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1038 dup_sack = true; 1217 dup_sack = 1;
1039 tcp_dsack_seen(tp); 1218 tcp_dsack_seen(tp);
1040 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); 1219 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1041 } else if (num_sacks > 1) { 1220 } else if (num_sacks > 1) {
@@ -1044,7 +1223,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1044 1223
1045 if (!after(end_seq_0, end_seq_1) && 1224 if (!after(end_seq_0, end_seq_1) &&
1046 !before(start_seq_0, start_seq_1)) { 1225 !before(start_seq_0, start_seq_1)) {
1047 dup_sack = true; 1226 dup_sack = 1;
1048 tcp_dsack_seen(tp); 1227 tcp_dsack_seen(tp);
1049 NET_INC_STATS_BH(sock_net(sk), 1228 NET_INC_STATS_BH(sock_net(sk),
1050 LINUX_MIB_TCPDSACKOFORECV); 1229 LINUX_MIB_TCPDSACKOFORECV);
@@ -1075,10 +1254,9 @@ struct tcp_sacktag_state {
1075 * FIXME: this could be merged to shift decision code 1254 * FIXME: this could be merged to shift decision code
1076 */ 1255 */
1077static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1256static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1078 u32 start_seq, u32 end_seq) 1257 u32 start_seq, u32 end_seq)
1079{ 1258{
1080 int err; 1259 int in_sack, err;
1081 bool in_sack;
1082 unsigned int pkt_len; 1260 unsigned int pkt_len;
1083 unsigned int mss; 1261 unsigned int mss;
1084 1262
@@ -1120,26 +1298,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1120 return in_sack; 1298 return in_sack;
1121} 1299}
1122 1300
1123/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ 1301static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1124static u8 tcp_sacktag_one(struct sock *sk, 1302 struct tcp_sacktag_state *state,
1125 struct tcp_sacktag_state *state, u8 sacked, 1303 int dup_sack, int pcount)
1126 u32 start_seq, u32 end_seq,
1127 bool dup_sack, int pcount)
1128{ 1304{
1129 struct tcp_sock *tp = tcp_sk(sk); 1305 struct tcp_sock *tp = tcp_sk(sk);
1306 u8 sacked = TCP_SKB_CB(skb)->sacked;
1130 int fack_count = state->fack_count; 1307 int fack_count = state->fack_count;
1131 1308
1132 /* Account D-SACK for retransmitted packet. */ 1309 /* Account D-SACK for retransmitted packet. */
1133 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1310 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1134 if (tp->undo_marker && tp->undo_retrans && 1311 if (tp->undo_marker && tp->undo_retrans &&
1135 after(end_seq, tp->undo_marker)) 1312 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1136 tp->undo_retrans--; 1313 tp->undo_retrans--;
1137 if (sacked & TCPCB_SACKED_ACKED) 1314 if (sacked & TCPCB_SACKED_ACKED)
1138 state->reord = min(fack_count, state->reord); 1315 state->reord = min(fack_count, state->reord);
1139 } 1316 }
1140 1317
1141 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1318 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1142 if (!after(end_seq, tp->snd_una)) 1319 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1143 return sacked; 1320 return sacked;
1144 1321
1145 if (!(sacked & TCPCB_SACKED_ACKED)) { 1322 if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1158,13 +1335,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
1158 /* New sack for not retransmitted frame, 1335 /* New sack for not retransmitted frame,
1159 * which was in hole. It is reordering. 1336 * which was in hole. It is reordering.
1160 */ 1337 */
1161 if (before(start_seq, 1338 if (before(TCP_SKB_CB(skb)->seq,
1162 tcp_highest_sack_seq(tp))) 1339 tcp_highest_sack_seq(tp)))
1163 state->reord = min(fack_count, 1340 state->reord = min(fack_count,
1164 state->reord); 1341 state->reord);
1165 1342
1166 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1343 /* SACK enhanced F-RTO (RFC4138; Appendix B) */
1167 if (!after(end_seq, tp->frto_highmark)) 1344 if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
1168 state->flag |= FLAG_ONLY_ORIG_SACKED; 1345 state->flag |= FLAG_ONLY_ORIG_SACKED;
1169 } 1346 }
1170 1347
@@ -1182,7 +1359,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1182 1359
1183 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1360 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1184 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1361 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1185 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1362 before(TCP_SKB_CB(skb)->seq,
1363 TCP_SKB_CB(tp->lost_skb_hint)->seq))
1186 tp->lost_cnt_hint += pcount; 1364 tp->lost_cnt_hint += pcount;
1187 1365
1188 if (fack_count > tp->fackets_out) 1366 if (fack_count > tp->fackets_out)
@@ -1201,30 +1379,16 @@ static u8 tcp_sacktag_one(struct sock *sk,
1201 return sacked; 1379 return sacked;
1202} 1380}
1203 1381
1204/* Shift newly-SACKed bytes from this skb to the immediately previous 1382static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1205 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 1383 struct tcp_sacktag_state *state,
1206 */ 1384 unsigned int pcount, int shifted, int mss,
1207static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1385 int dup_sack)
1208 struct tcp_sacktag_state *state,
1209 unsigned int pcount, int shifted, int mss,
1210 bool dup_sack)
1211{ 1386{
1212 struct tcp_sock *tp = tcp_sk(sk); 1387 struct tcp_sock *tp = tcp_sk(sk);
1213 struct sk_buff *prev = tcp_write_queue_prev(sk, skb); 1388 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1214 u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
1215 u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
1216 1389
1217 BUG_ON(!pcount); 1390 BUG_ON(!pcount);
1218 1391
1219 /* Adjust counters and hints for the newly sacked sequence
1220 * range but discard the return value since prev is already
1221 * marked. We must tag the range first because the seq
1222 * advancement below implicitly advances
1223 * tcp_highest_sack_seq() when skb is highest_sack.
1224 */
1225 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1226 start_seq, end_seq, dup_sack, pcount);
1227
1228 if (skb == tp->lost_skb_hint) 1392 if (skb == tp->lost_skb_hint)
1229 tp->lost_cnt_hint += pcount; 1393 tp->lost_cnt_hint += pcount;
1230 1394
@@ -1251,13 +1415,16 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1251 skb_shinfo(skb)->gso_type = 0; 1415 skb_shinfo(skb)->gso_type = 0;
1252 } 1416 }
1253 1417
1418 /* We discard results */
1419 tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
1420
1254 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1421 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1255 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1422 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1256 1423
1257 if (skb->len > 0) { 1424 if (skb->len > 0) {
1258 BUG_ON(!tcp_skb_pcount(skb)); 1425 BUG_ON(!tcp_skb_pcount(skb));
1259 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); 1426 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1260 return false; 1427 return 0;
1261 } 1428 }
1262 1429
1263 /* Whole SKB was eaten :-) */ 1430 /* Whole SKB was eaten :-) */
@@ -1271,7 +1438,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1271 tp->lost_cnt_hint -= tcp_skb_pcount(prev); 1438 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1272 } 1439 }
1273 1440
1274 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; 1441 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1275 if (skb == tcp_highest_sack(sk)) 1442 if (skb == tcp_highest_sack(sk))
1276 tcp_advance_highest_sack(sk, skb); 1443 tcp_advance_highest_sack(sk, skb);
1277 1444
@@ -1280,19 +1447,19 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1280 1447
1281 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); 1448 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1282 1449
1283 return true; 1450 return 1;
1284} 1451}
1285 1452
1286/* I wish gso_size would have a bit more sane initialization than 1453/* I wish gso_size would have a bit more sane initialization than
1287 * something-or-zero which complicates things 1454 * something-or-zero which complicates things
1288 */ 1455 */
1289static int tcp_skb_seglen(const struct sk_buff *skb) 1456static int tcp_skb_seglen(struct sk_buff *skb)
1290{ 1457{
1291 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); 1458 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1292} 1459}
1293 1460
1294/* Shifting pages past head area doesn't work */ 1461/* Shifting pages past head area doesn't work */
1295static int skb_can_shift(const struct sk_buff *skb) 1462static int skb_can_shift(struct sk_buff *skb)
1296{ 1463{
1297 return !skb_headlen(skb) && skb_is_nonlinear(skb); 1464 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1298} 1465}
@@ -1303,7 +1470,7 @@ static int skb_can_shift(const struct sk_buff *skb)
1303static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, 1470static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1304 struct tcp_sacktag_state *state, 1471 struct tcp_sacktag_state *state,
1305 u32 start_seq, u32 end_seq, 1472 u32 start_seq, u32 end_seq,
1306 bool dup_sack) 1473 int dup_sack)
1307{ 1474{
1308 struct tcp_sock *tp = tcp_sk(sk); 1475 struct tcp_sock *tp = tcp_sk(sk);
1309 struct sk_buff *prev; 1476 struct sk_buff *prev;
@@ -1398,10 +1565,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1398 } 1565 }
1399 } 1566 }
1400 1567
1401 /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
1402 if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1403 goto fallback;
1404
1405 if (!skb_shift(prev, skb, len)) 1568 if (!skb_shift(prev, skb, len))
1406 goto fallback; 1569 goto fallback;
1407 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) 1570 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1442,14 +1605,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1442 struct tcp_sack_block *next_dup, 1605 struct tcp_sack_block *next_dup,
1443 struct tcp_sacktag_state *state, 1606 struct tcp_sacktag_state *state,
1444 u32 start_seq, u32 end_seq, 1607 u32 start_seq, u32 end_seq,
1445 bool dup_sack_in) 1608 int dup_sack_in)
1446{ 1609{
1447 struct tcp_sock *tp = tcp_sk(sk); 1610 struct tcp_sock *tp = tcp_sk(sk);
1448 struct sk_buff *tmp; 1611 struct sk_buff *tmp;
1449 1612
1450 tcp_for_write_queue_from(skb, sk) { 1613 tcp_for_write_queue_from(skb, sk) {
1451 int in_sack = 0; 1614 int in_sack = 0;
1452 bool dup_sack = dup_sack_in; 1615 int dup_sack = dup_sack_in;
1453 1616
1454 if (skb == tcp_send_head(sk)) 1617 if (skb == tcp_send_head(sk))
1455 break; 1618 break;
@@ -1464,7 +1627,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1464 next_dup->start_seq, 1627 next_dup->start_seq,
1465 next_dup->end_seq); 1628 next_dup->end_seq);
1466 if (in_sack > 0) 1629 if (in_sack > 0)
1467 dup_sack = true; 1630 dup_sack = 1;
1468 } 1631 }
1469 1632
1470 /* skb reference here is a bit tricky to get right, since 1633 /* skb reference here is a bit tricky to get right, since
@@ -1492,14 +1655,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1492 break; 1655 break;
1493 1656
1494 if (in_sack) { 1657 if (in_sack) {
1495 TCP_SKB_CB(skb)->sacked = 1658 TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
1496 tcp_sacktag_one(sk, 1659 state,
1497 state, 1660 dup_sack,
1498 TCP_SKB_CB(skb)->sacked, 1661 tcp_skb_pcount(skb));
1499 TCP_SKB_CB(skb)->seq,
1500 TCP_SKB_CB(skb)->end_seq,
1501 dup_sack,
1502 tcp_skb_pcount(skb));
1503 1662
1504 if (!before(TCP_SKB_CB(skb)->seq, 1663 if (!before(TCP_SKB_CB(skb)->seq,
1505 tcp_highest_sack_seq(tp))) 1664 tcp_highest_sack_seq(tp)))
@@ -1549,19 +1708,19 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1549 return skb; 1708 return skb;
1550} 1709}
1551 1710
1552static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) 1711static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
1553{ 1712{
1554 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 1713 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1555} 1714}
1556 1715
1557static int 1716static int
1558tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1717tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1559 u32 prior_snd_una) 1718 u32 prior_snd_una)
1560{ 1719{
1561 const struct inet_connection_sock *icsk = inet_csk(sk); 1720 const struct inet_connection_sock *icsk = inet_csk(sk);
1562 struct tcp_sock *tp = tcp_sk(sk); 1721 struct tcp_sock *tp = tcp_sk(sk);
1563 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1722 unsigned char *ptr = (skb_transport_header(ack_skb) +
1564 TCP_SKB_CB(ack_skb)->sacked); 1723 TCP_SKB_CB(ack_skb)->sacked);
1565 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 1724 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1566 struct tcp_sack_block sp[TCP_NUM_SACKS]; 1725 struct tcp_sack_block sp[TCP_NUM_SACKS];
1567 struct tcp_sack_block *cache; 1726 struct tcp_sack_block *cache;
@@ -1569,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1569 struct sk_buff *skb; 1728 struct sk_buff *skb;
1570 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1729 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1571 int used_sacks; 1730 int used_sacks;
1572 bool found_dup_sack = false; 1731 int found_dup_sack = 0;
1573 int i, j; 1732 int i, j;
1574 int first_sack_index; 1733 int first_sack_index;
1575 1734
@@ -1600,7 +1759,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1600 used_sacks = 0; 1759 used_sacks = 0;
1601 first_sack_index = 0; 1760 first_sack_index = 0;
1602 for (i = 0; i < num_sacks; i++) { 1761 for (i = 0; i < num_sacks; i++) {
1603 bool dup_sack = !i && found_dup_sack; 1762 int dup_sack = !i && found_dup_sack;
1604 1763
1605 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); 1764 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1606 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); 1765 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1667,12 +1826,16 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1667 while (i < used_sacks) { 1826 while (i < used_sacks) {
1668 u32 start_seq = sp[i].start_seq; 1827 u32 start_seq = sp[i].start_seq;
1669 u32 end_seq = sp[i].end_seq; 1828 u32 end_seq = sp[i].end_seq;
1670 bool dup_sack = (found_dup_sack && (i == first_sack_index)); 1829 int dup_sack = (found_dup_sack && (i == first_sack_index));
1671 struct tcp_sack_block *next_dup = NULL; 1830 struct tcp_sack_block *next_dup = NULL;
1672 1831
1673 if (found_dup_sack && ((i + 1) == first_sack_index)) 1832 if (found_dup_sack && ((i + 1) == first_sack_index))
1674 next_dup = &sp[i + 1]; 1833 next_dup = &sp[i + 1];
1675 1834
1835 /* Event "B" in the comment above. */
1836 if (after(end_seq, tp->high_seq))
1837 state.flag |= FLAG_DATA_LOST;
1838
1676 /* Skip too early cached blocks */ 1839 /* Skip too early cached blocks */
1677 while (tcp_sack_cache_ok(tp, cache) && 1840 while (tcp_sack_cache_ok(tp, cache) &&
1678 !before(start_seq, cache->end_seq)) 1841 !before(start_seq, cache->end_seq))
@@ -1769,9 +1932,9 @@ out:
1769} 1932}
1770 1933
1771/* Limits sacked_out so that sum with lost_out isn't ever larger than 1934/* Limits sacked_out so that sum with lost_out isn't ever larger than
1772 * packets_out. Returns false if sacked_out adjustment wasn't necessary. 1935 * packets_out. Returns zero if sacked_out adjustment wasn't necessary.
1773 */ 1936 */
1774static bool tcp_limit_reno_sacked(struct tcp_sock *tp) 1937static int tcp_limit_reno_sacked(struct tcp_sock *tp)
1775{ 1938{
1776 u32 holes; 1939 u32 holes;
1777 1940
@@ -1780,9 +1943,9 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1780 1943
1781 if ((tp->sacked_out + holes) > tp->packets_out) { 1944 if ((tp->sacked_out + holes) > tp->packets_out) {
1782 tp->sacked_out = tp->packets_out - holes; 1945 tp->sacked_out = tp->packets_out - holes;
1783 return true; 1946 return 1;
1784 } 1947 }
1785 return false; 1948 return 0;
1786} 1949}
1787 1950
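
Since a Reno (non-SACK) flow only infers SACKed data from duplicate ACKs, the clamp above preserves the bookkeeping invariant sacked_out + lost_out <= packets_out. The following is a toy model of that invariant on plain integers rather than the real tcp_sock; it also omits the additional cap the kernel applies to holes, so it is illustrative only:

/* Toy model of the Reno sacked_out clamp: duplicate ACKs stand in for
 * SACKed segments, so cap them to keep sacked_out + holes <= packets_out. */
struct reno_counts {
        unsigned int packets_out;
        unsigned int sacked_out;
        unsigned int lost_out;
};

static int limit_reno_sacked(struct reno_counts *c)
{
        unsigned int holes = c->lost_out ? c->lost_out : 1;

        if (c->sacked_out + holes > c->packets_out) {
                c->sacked_out = c->packets_out - holes;
                return 1;       /* an adjustment was necessary */
        }
        return 0;
}
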
1788/* If we receive more dupacks than we expected counting segments 1951/* If we receive more dupacks than we expected counting segments
@@ -1836,40 +1999,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp)
1836/* F-RTO can only be used if TCP has never retransmitted anything other than 1999/* F-RTO can only be used if TCP has never retransmitted anything other than
1837 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) 2000 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
1838 */ 2001 */
1839bool tcp_use_frto(struct sock *sk) 2002int tcp_use_frto(struct sock *sk)
1840{ 2003{
1841 const struct tcp_sock *tp = tcp_sk(sk); 2004 const struct tcp_sock *tp = tcp_sk(sk);
1842 const struct inet_connection_sock *icsk = inet_csk(sk); 2005 const struct inet_connection_sock *icsk = inet_csk(sk);
1843 struct sk_buff *skb; 2006 struct sk_buff *skb;
1844 2007
1845 if (!sysctl_tcp_frto) 2008 if (!sysctl_tcp_frto)
1846 return false; 2009 return 0;
1847 2010
1848 /* MTU probe and F-RTO won't really play nicely along currently */ 2011 /* MTU probe and F-RTO won't really play nicely along currently */
1849 if (icsk->icsk_mtup.probe_size) 2012 if (icsk->icsk_mtup.probe_size)
1850 return false; 2013 return 0;
1851 2014
1852 if (tcp_is_sackfrto(tp)) 2015 if (tcp_is_sackfrto(tp))
1853 return true; 2016 return 1;
1854 2017
1855 /* Avoid expensive walking of rexmit queue if possible */ 2018 /* Avoid expensive walking of rexmit queue if possible */
1856 if (tp->retrans_out > 1) 2019 if (tp->retrans_out > 1)
1857 return false; 2020 return 0;
1858 2021
1859 skb = tcp_write_queue_head(sk); 2022 skb = tcp_write_queue_head(sk);
1860 if (tcp_skb_is_last(sk, skb)) 2023 if (tcp_skb_is_last(sk, skb))
1861 return true; 2024 return 1;
1862 skb = tcp_write_queue_next(sk, skb); /* Skips head */ 2025 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1863 tcp_for_write_queue_from(skb, sk) { 2026 tcp_for_write_queue_from(skb, sk) {
1864 if (skb == tcp_send_head(sk)) 2027 if (skb == tcp_send_head(sk))
1865 break; 2028 break;
1866 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) 2029 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1867 return false; 2030 return 0;
1868 /* Short-circuit when first non-SACKed skb has been checked */ 2031 /* Short-circuit when first non-SACKed skb has been checked */
1869 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2032 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1870 break; 2033 break;
1871 } 2034 }
1872 return true; 2035 return 1;
1873} 2036}
1874 2037
1875/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO 2038/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
@@ -2105,7 +2268,7 @@ void tcp_enter_loss(struct sock *sk, int how)
2105 * 2268 *
2106 * Do processing similar to RTO timeout. 2269 * Do processing similar to RTO timeout.
2107 */ 2270 */
2108static bool tcp_check_sack_reneging(struct sock *sk, int flag) 2271static int tcp_check_sack_reneging(struct sock *sk, int flag)
2109{ 2272{
2110 if (flag & FLAG_SACK_RENEGING) { 2273 if (flag & FLAG_SACK_RENEGING) {
2111 struct inet_connection_sock *icsk = inet_csk(sk); 2274 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2116,12 +2279,12 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2116 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); 2279 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
2117 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2280 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2118 icsk->icsk_rto, TCP_RTO_MAX); 2281 icsk->icsk_rto, TCP_RTO_MAX);
2119 return true; 2282 return 1;
2120 } 2283 }
2121 return false; 2284 return 0;
2122} 2285}
2123 2286
2124static inline int tcp_fackets_out(const struct tcp_sock *tp) 2287static inline int tcp_fackets_out(struct tcp_sock *tp)
2125{ 2288{
2126 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; 2289 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
2127} 2290}
@@ -2141,41 +2304,19 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
2141 * they differ. Since neither occurs due to loss, TCP should really 2304 * they differ. Since neither occurs due to loss, TCP should really
2142 * ignore them. 2305 * ignore them.
2143 */ 2306 */
2144static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 2307static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2145{ 2308{
2146 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2309 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2147} 2310}
2148 2311
2149static bool tcp_pause_early_retransmit(struct sock *sk, int flag) 2312static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2150{
2151 struct tcp_sock *tp = tcp_sk(sk);
2152 unsigned long delay;
2153
2154 /* Delay early retransmit and entering fast recovery for
2155 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2156 * available, or RTO is scheduled to fire first.
2157 */
2158 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
2159 return false;
2160
2161 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2162 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2163 return false;
2164
2165 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
2166 tp->early_retrans_delayed = 1;
2167 return true;
2168}
2169
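
One detail worth spelling out in the removed helper above: tp->srtt is kept in 1/8-tick fixed point (eight times the smoothed RTT in jiffies), so srtt >> 5 is RTT/4, exactly the "max(RTT/4, 2msec)" the comment promises. A one-function sketch of that delay computation follows; early_retrans_delay is an illustrative name, not a kernel symbol:

/* srtt is stored as 8 * smoothed_rtt, so shifting right by 5 yields RTT/4.
 * Illustrative only; tcp_pause_early_retransmit() above plugs the result
 * straight into inet_csk_reset_xmit_timer(). */
static unsigned long early_retrans_delay(unsigned long srtt_shifted,
                                         unsigned long two_msec_jiffies)
{
        unsigned long quarter_rtt = srtt_shifted >> 5;

        return quarter_rtt > two_msec_jiffies ? quarter_rtt : two_msec_jiffies;
}
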
2170static inline int tcp_skb_timedout(const struct sock *sk,
2171 const struct sk_buff *skb)
2172{ 2313{
2173 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; 2314 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2174} 2315}
2175 2316
2176static inline int tcp_head_timedout(const struct sock *sk) 2317static inline int tcp_head_timedout(struct sock *sk)
2177{ 2318{
2178 const struct tcp_sock *tp = tcp_sk(sk); 2319 struct tcp_sock *tp = tcp_sk(sk);
2179 2320
2180 return tp->packets_out && 2321 return tp->packets_out &&
2181 tcp_skb_timedout(sk, tcp_write_queue_head(sk)); 2322 tcp_skb_timedout(sk, tcp_write_queue_head(sk));
@@ -2274,28 +2415,28 @@ static inline int tcp_head_timedout(const struct sock *sk)
2274 * Main question: may we further continue forward transmission 2415 * Main question: may we further continue forward transmission
2275 * with the same cwnd? 2416 * with the same cwnd?
2276 */ 2417 */
2277static bool tcp_time_to_recover(struct sock *sk, int flag) 2418static int tcp_time_to_recover(struct sock *sk)
2278{ 2419{
2279 struct tcp_sock *tp = tcp_sk(sk); 2420 struct tcp_sock *tp = tcp_sk(sk);
2280 __u32 packets_out; 2421 __u32 packets_out;
2281 2422
2282 /* Do not perform any recovery during F-RTO algorithm */ 2423 /* Do not perform any recovery during F-RTO algorithm */
2283 if (tp->frto_counter) 2424 if (tp->frto_counter)
2284 return false; 2425 return 0;
2285 2426
2286 /* Trick#1: The loss is proven. */ 2427 /* Trick#1: The loss is proven. */
2287 if (tp->lost_out) 2428 if (tp->lost_out)
2288 return true; 2429 return 1;
2289 2430
2290 /* Not-A-Trick#2 : Classic rule... */ 2431 /* Not-A-Trick#2 : Classic rule... */
2291 if (tcp_dupack_heuristics(tp) > tp->reordering) 2432 if (tcp_dupack_heuristics(tp) > tp->reordering)
2292 return true; 2433 return 1;
2293 2434
2294 /* Trick#3 : when we use RFC2988 timer restart, fast 2435 /* Trick#3 : when we use RFC2988 timer restart, fast
2295 * retransmit can be triggered by timeout of queue head. 2436 * retransmit can be triggered by timeout of queue head.
2296 */ 2437 */
2297 if (tcp_is_fack(tp) && tcp_head_timedout(sk)) 2438 if (tcp_is_fack(tp) && tcp_head_timedout(sk))
2298 return true; 2439 return 1;
2299 2440
2300 /* Trick#4: It is still not OK... But will it be useful to delay 2441 /* Trick#4: It is still not OK... But will it be useful to delay
2301 * recovery more? 2442 * recovery more?
@@ -2307,7 +2448,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2307 /* We have nothing to send. This connection is limited 2448 /* We have nothing to send. This connection is limited
2308 * either by receiver window or by application. 2449 * either by receiver window or by application.
2309 */ 2450 */
2310 return true; 2451 return 1;
2311 } 2452 }
2312 2453
2313 /* If a thin stream is detected, retransmit after first 2454 /* If a thin stream is detected, retransmit after first
@@ -2318,19 +2459,9 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2318 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && 2459 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2319 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && 2460 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2320 tcp_is_sack(tp) && !tcp_send_head(sk)) 2461 tcp_is_sack(tp) && !tcp_send_head(sk))
2321 return true; 2462 return 1;
2322 2463
2323 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious 2464 return 0;
2324 * retransmissions due to small network reorderings, we implement
2325 * Mitigation A.3 in the RFC and delay the retransmission for a short
2326 * interval if appropriate.
2327 */
2328 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2329 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
2330 !tcp_may_send_now(sk))
2331 return !tcp_pause_early_retransmit(sk, flag);
2332
2333 return false;
2334} 2465}
2335 2466
2336/* New heuristics: it is possible only after we switched to restart timer 2467/* New heuristics: it is possible only after we switched to restart timer
@@ -2371,11 +2502,8 @@ static void tcp_timeout_skbs(struct sock *sk)
2371 tcp_verify_left_out(tp); 2502 tcp_verify_left_out(tp);
2372} 2503}
2373 2504
2374/* Detect loss in event "A" above by marking head of queue up as lost. 2505/* Mark head of queue up as lost. With RFC3517 SACK, the packet count
2375 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments 2506 * is against sacked "cnt", otherwise it's against facked "cnt"
2376 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2377 * has at least tp->reordering SACKed seqments above it; "packets" refers to
2378 * the maximum SACKed segments to pass before reaching this limit.
2379 */ 2507 */
2380static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) 2508static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2381{ 2509{
@@ -2384,8 +2512,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2384 int cnt, oldcnt; 2512 int cnt, oldcnt;
2385 int err; 2513 int err;
2386 unsigned int mss; 2514 unsigned int mss;
2387 /* Use SACK to deduce losses of new sequences sent during recovery */
2388 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2389 2515
2390 WARN_ON(packets > tp->packets_out); 2516 WARN_ON(packets > tp->packets_out);
2391 if (tp->lost_skb_hint) { 2517 if (tp->lost_skb_hint) {
@@ -2407,7 +2533,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2407 tp->lost_skb_hint = skb; 2533 tp->lost_skb_hint = skb;
2408 tp->lost_cnt_hint = cnt; 2534 tp->lost_cnt_hint = cnt;
2409 2535
2410 if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) 2536 if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
2411 break; 2537 break;
2412 2538
2413 oldcnt = cnt; 2539 oldcnt = cnt;
@@ -2417,7 +2543,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2417 2543
2418 if (cnt > packets) { 2544 if (cnt > packets) {
2419 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2545 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2420 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2421 (oldcnt >= packets)) 2546 (oldcnt >= packets))
2422 break; 2547 break;
2423 2548
@@ -2470,10 +2595,39 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2470 tp->snd_cwnd_stamp = tcp_time_stamp; 2595 tp->snd_cwnd_stamp = tcp_time_stamp;
2471} 2596}
2472 2597
2598/* Lower bound on congestion window is slow start threshold
2599 * unless congestion avoidance choice decides to override it.
2600 */
2601static inline u32 tcp_cwnd_min(const struct sock *sk)
2602{
2603 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2604
2605 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
2606}
2607
2608/* Decrease cwnd each second ack. */
2609static void tcp_cwnd_down(struct sock *sk, int flag)
2610{
2611 struct tcp_sock *tp = tcp_sk(sk);
2612 int decr = tp->snd_cwnd_cnt + 1;
2613
2614 if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
2615 (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
2616 tp->snd_cwnd_cnt = decr & 1;
2617 decr >>= 1;
2618
2619 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
2620 tp->snd_cwnd -= decr;
2621
2622 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
2623 tp->snd_cwnd_stamp = tcp_time_stamp;
2624 }
2625}
2626
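
The rate-halving in tcp_cwnd_down() above works out to shaving one segment off cwnd for roughly every two ACKs until the tcp_cwnd_min() floor is reached. Here is a tiny standalone simulation of just that counter arithmetic, ignoring the flag tests and the packets-in-flight clamp:

/* Standalone simulation of the tcp_cwnd_down() counter: cwnd loses one
 * segment per two ACKs until it reaches the ssthresh floor. */
#include <stdio.h>

int main(void)
{
        unsigned int cwnd = 20, ssthresh = 10, cwnd_cnt = 0;

        for (int ack = 1; ack <= 12; ack++) {
                unsigned int decr = cwnd_cnt + 1;

                cwnd_cnt = decr & 1;
                decr >>= 1;
                if (decr && cwnd > ssthresh)
                        cwnd -= decr;
                printf("ack %2d: cwnd=%u\n", ack, cwnd);
        }
        return 0;       /* cwnd reaches 14 here; ~20 ACKs to hit ssthresh */
}
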
2473/* Nothing was retransmitted or returned timestamp is less 2627/* Nothing was retransmitted or returned timestamp is less
2474 * than timestamp of the first retransmission. 2628 * than timestamp of the first retransmission.
2475 */ 2629 */
2476static inline bool tcp_packet_delayed(const struct tcp_sock *tp) 2630static inline int tcp_packet_delayed(struct tcp_sock *tp)
2477{ 2631{
2478 return !tp->retrans_stamp || 2632 return !tp->retrans_stamp ||
2479 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2633 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2489,22 +2643,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2489 struct inet_sock *inet = inet_sk(sk); 2643 struct inet_sock *inet = inet_sk(sk);
2490 2644
2491 if (sk->sk_family == AF_INET) { 2645 if (sk->sk_family == AF_INET) {
2492 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2646 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2493 msg, 2647 msg,
2494 &inet->inet_daddr, ntohs(inet->inet_dport), 2648 &inet->inet_daddr, ntohs(inet->inet_dport),
2495 tp->snd_cwnd, tcp_left_out(tp), 2649 tp->snd_cwnd, tcp_left_out(tp),
2496 tp->snd_ssthresh, tp->prior_ssthresh, 2650 tp->snd_ssthresh, tp->prior_ssthresh,
2497 tp->packets_out); 2651 tp->packets_out);
2498 } 2652 }
2499#if IS_ENABLED(CONFIG_IPV6) 2653#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2500 else if (sk->sk_family == AF_INET6) { 2654 else if (sk->sk_family == AF_INET6) {
2501 struct ipv6_pinfo *np = inet6_sk(sk); 2655 struct ipv6_pinfo *np = inet6_sk(sk);
2502 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2656 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2503 msg, 2657 msg,
2504 &np->daddr, ntohs(inet->inet_dport), 2658 &np->daddr, ntohs(inet->inet_dport),
2505 tp->snd_cwnd, tcp_left_out(tp), 2659 tp->snd_cwnd, tcp_left_out(tp),
2506 tp->snd_ssthresh, tp->prior_ssthresh, 2660 tp->snd_ssthresh, tp->prior_ssthresh,
2507 tp->packets_out); 2661 tp->packets_out);
2508 } 2662 }
2509#endif 2663#endif
2510} 2664}
@@ -2534,13 +2688,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2534 tp->snd_cwnd_stamp = tcp_time_stamp; 2688 tp->snd_cwnd_stamp = tcp_time_stamp;
2535} 2689}
2536 2690
2537static inline bool tcp_may_undo(const struct tcp_sock *tp) 2691static inline int tcp_may_undo(struct tcp_sock *tp)
2538{ 2692{
2539 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); 2693 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2540} 2694}
2541 2695
2542/* People celebrate: "We love our President!" */ 2696/* People celebrate: "We love our President!" */
2543static bool tcp_try_undo_recovery(struct sock *sk) 2697static int tcp_try_undo_recovery(struct sock *sk)
2544{ 2698{
2545 struct tcp_sock *tp = tcp_sk(sk); 2699 struct tcp_sock *tp = tcp_sk(sk);
2546 2700
@@ -2565,10 +2719,10 @@ static bool tcp_try_undo_recovery(struct sock *sk)
2565 * is ACKed. For Reno it is MUST to prevent false 2719 * is ACKed. For Reno it is MUST to prevent false
2566 * fast retransmits (RFC2582). SACK TCP is safe. */ 2720 * fast retransmits (RFC2582). SACK TCP is safe. */
2567 tcp_moderate_cwnd(tp); 2721 tcp_moderate_cwnd(tp);
2568 return true; 2722 return 1;
2569 } 2723 }
2570 tcp_set_ca_state(sk, TCP_CA_Open); 2724 tcp_set_ca_state(sk, TCP_CA_Open);
2571 return false; 2725 return 0;
2572} 2726}
2573 2727
2574/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ 2728/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
@@ -2598,19 +2752,19 @@ static void tcp_try_undo_dsack(struct sock *sk)
2598 * that successive retransmissions of a segment must not advance 2752 * that successive retransmissions of a segment must not advance
2599 * retrans_stamp under any conditions. 2753 * retrans_stamp under any conditions.
2600 */ 2754 */
2601static bool tcp_any_retrans_done(const struct sock *sk) 2755static int tcp_any_retrans_done(struct sock *sk)
2602{ 2756{
2603 const struct tcp_sock *tp = tcp_sk(sk); 2757 struct tcp_sock *tp = tcp_sk(sk);
2604 struct sk_buff *skb; 2758 struct sk_buff *skb;
2605 2759
2606 if (tp->retrans_out) 2760 if (tp->retrans_out)
2607 return true; 2761 return 1;
2608 2762
2609 skb = tcp_write_queue_head(sk); 2763 skb = tcp_write_queue_head(sk);
2610 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 2764 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2611 return true; 2765 return 1;
2612 2766
2613 return false; 2767 return 0;
2614} 2768}
2615 2769
2616/* Undo during fast recovery after partial ACK. */ 2770/* Undo during fast recovery after partial ACK. */
@@ -2644,7 +2798,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2644} 2798}
2645 2799
2646/* Undo during loss recovery after partial ACK. */ 2800/* Undo during loss recovery after partial ACK. */
2647static bool tcp_try_undo_loss(struct sock *sk) 2801static int tcp_try_undo_loss(struct sock *sk)
2648{ 2802{
2649 struct tcp_sock *tp = tcp_sk(sk); 2803 struct tcp_sock *tp = tcp_sk(sk);
2650 2804
@@ -2666,91 +2820,28 @@ static bool tcp_try_undo_loss(struct sock *sk)
2666 tp->undo_marker = 0; 2820 tp->undo_marker = 0;
2667 if (tcp_is_sack(tp)) 2821 if (tcp_is_sack(tp))
2668 tcp_set_ca_state(sk, TCP_CA_Open); 2822 tcp_set_ca_state(sk, TCP_CA_Open);
2669 return true; 2823 return 1;
2670 }
2671 return false;
2672}
2673
2674/* The cwnd reduction in CWR and Recovery use the PRR algorithm
2675 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2676 * It computes the number of packets to send (sndcnt) based on packets newly
2677 * delivered:
2678 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2679 * cwnd reductions across a full RTT.
2680 * 2) If packets in flight is lower than ssthresh (such as due to excess
2681 * losses and/or application stalls), do not perform any further cwnd
2682 * reductions, but instead slow start up to ssthresh.
2683 */
2684static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2685{
2686 struct tcp_sock *tp = tcp_sk(sk);
2687
2688 tp->high_seq = tp->snd_nxt;
2689 tp->bytes_acked = 0;
2690 tp->snd_cwnd_cnt = 0;
2691 tp->prior_cwnd = tp->snd_cwnd;
2692 tp->prr_delivered = 0;
2693 tp->prr_out = 0;
2694 if (set_ssthresh)
2695 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2696 TCP_ECN_queue_cwr(tp);
2697}
2698
2699static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2700 int fast_rexmit)
2701{
2702 struct tcp_sock *tp = tcp_sk(sk);
2703 int sndcnt = 0;
2704 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2705
2706 tp->prr_delivered += newly_acked_sacked;
2707 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2708 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2709 tp->prior_cwnd - 1;
2710 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2711 } else {
2712 sndcnt = min_t(int, delta,
2713 max_t(int, tp->prr_delivered - tp->prr_out,
2714 newly_acked_sacked) + 1);
2715 } 2824 }
2716 2825 return 0;
2717 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2718 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2719} 2826}
2720 2827
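
For readers following the PRR block removed above: the sndcnt computation can be read as two regimes keyed on packets in flight versus ssthresh. The sketch below restates that arithmetic with plain parameters in place of the tcp_sock fields; prr_sndcnt is an illustrative name, not a kernel function.

/* Restatement of the removed PRR computation: while in_flight > ssthresh,
 * release ssthresh/prior_cwnd of what the receiver has newly reported
 * delivered (spreading the reduction over one RTT); otherwise stop cutting
 * and slow-start back toward ssthresh. */
#include <stdint.h>

static int prr_sndcnt(uint32_t ssthresh, uint32_t prior_cwnd, uint32_t in_flight,
                      uint32_t prr_delivered, uint32_t prr_out,
                      uint32_t newly_acked_sacked, int fast_rexmit)
{
        int sndcnt;

        if (in_flight > ssthresh) {
                uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
                                    prior_cwnd - 1;
                sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
        } else {
                int delta = (int)(ssthresh - in_flight);
                int catchup = (int)(prr_delivered - prr_out);

                if (catchup < (int)newly_acked_sacked)
                        catchup = (int)newly_acked_sacked;
                sndcnt = delta < catchup + 1 ? delta : catchup + 1;
        }
        if (sndcnt < (fast_rexmit ? 1 : 0))
                sndcnt = fast_rexmit ? 1 : 0;
        return sndcnt;  /* the new cwnd is packets in flight plus sndcnt */
}
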
2721static inline void tcp_end_cwnd_reduction(struct sock *sk) 2828static inline void tcp_complete_cwr(struct sock *sk)
2722{ 2829{
2723 struct tcp_sock *tp = tcp_sk(sk); 2830 struct tcp_sock *tp = tcp_sk(sk);
2724 2831 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2725 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ 2832 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2726 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2727 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2728 tp->snd_cwnd = tp->snd_ssthresh; 2833 tp->snd_cwnd = tp->snd_ssthresh;
2729 tp->snd_cwnd_stamp = tcp_time_stamp; 2834 tp->snd_cwnd_stamp = tcp_time_stamp;
2730 } 2835 }
2731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2836 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2732} 2837}
2733 2838
2734/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2735void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2736{
2737 struct tcp_sock *tp = tcp_sk(sk);
2738
2739 tp->prior_ssthresh = 0;
2740 tp->bytes_acked = 0;
2741 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2742 tp->undo_marker = 0;
2743 tcp_init_cwnd_reduction(sk, set_ssthresh);
2744 tcp_set_ca_state(sk, TCP_CA_CWR);
2745 }
2746}
2747
2748static void tcp_try_keep_open(struct sock *sk) 2839static void tcp_try_keep_open(struct sock *sk)
2749{ 2840{
2750 struct tcp_sock *tp = tcp_sk(sk); 2841 struct tcp_sock *tp = tcp_sk(sk);
2751 int state = TCP_CA_Open; 2842 int state = TCP_CA_Open;
2752 2843
2753 if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) 2844 if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
2754 state = TCP_CA_Disorder; 2845 state = TCP_CA_Disorder;
2755 2846
2756 if (inet_csk(sk)->icsk_ca_state != state) { 2847 if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2759,7 +2850,7 @@ static void tcp_try_keep_open(struct sock *sk)
2759 } 2850 }
2760} 2851}
2761 2852
2762static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) 2853static void tcp_try_to_open(struct sock *sk, int flag)
2763{ 2854{
2764 struct tcp_sock *tp = tcp_sk(sk); 2855 struct tcp_sock *tp = tcp_sk(sk);
2765 2856
@@ -2773,10 +2864,9 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2773 2864
2774 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2865 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2775 tcp_try_keep_open(sk); 2866 tcp_try_keep_open(sk);
2776 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2867 tcp_moderate_cwnd(tp);
2777 tcp_moderate_cwnd(tp);
2778 } else { 2868 } else {
2779 tcp_cwnd_reduction(sk, newly_acked_sacked, 0); 2869 tcp_cwnd_down(sk, flag);
2780 } 2870 }
2781} 2871}
2782 2872
@@ -2858,30 +2948,6 @@ void tcp_simple_retransmit(struct sock *sk)
2858} 2948}
2859EXPORT_SYMBOL(tcp_simple_retransmit); 2949EXPORT_SYMBOL(tcp_simple_retransmit);
2860 2950
2861static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2862{
2863 struct tcp_sock *tp = tcp_sk(sk);
2864 int mib_idx;
2865
2866 if (tcp_is_reno(tp))
2867 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2868 else
2869 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2870
2871 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2872
2873 tp->prior_ssthresh = 0;
2874 tp->undo_marker = tp->snd_una;
2875 tp->undo_retrans = tp->retrans_out;
2876
2877 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2878 if (!ece_ack)
2879 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2880 tcp_init_cwnd_reduction(sk, true);
2881 }
2882 tcp_set_ca_state(sk, TCP_CA_Recovery);
2883}
2884
2885/* Process an event, which can update packets-in-flight not trivially. 2951/* Process an event, which can update packets-in-flight not trivially.
2886 * Main goal of this function is to calculate new estimate for left_out, 2952 * Main goal of this function is to calculate new estimate for left_out,
2887 * taking into account both packets sitting in receiver's buffer and 2953 * taking into account both packets sitting in receiver's buffer and
@@ -2893,16 +2959,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2893 * It does _not_ decide what to send, it is made in function 2959 * It does _not_ decide what to send, it is made in function
2894 * tcp_xmit_retransmit_queue(). 2960 * tcp_xmit_retransmit_queue().
2895 */ 2961 */
2896static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 2962static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2897 int prior_sacked, bool is_dupack,
2898 int flag)
2899{ 2963{
2900 struct inet_connection_sock *icsk = inet_csk(sk); 2964 struct inet_connection_sock *icsk = inet_csk(sk);
2901 struct tcp_sock *tp = tcp_sk(sk); 2965 struct tcp_sock *tp = tcp_sk(sk);
2966 int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
2902 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 2967 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2903 (tcp_fackets_out(tp) > tp->reordering)); 2968 (tcp_fackets_out(tp) > tp->reordering));
2904 int newly_acked_sacked = 0; 2969 int fast_rexmit = 0, mib_idx;
2905 int fast_rexmit = 0;
2906 2970
2907 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 2971 if (WARN_ON(!tp->packets_out && tp->sacked_out))
2908 tp->sacked_out = 0; 2972 tp->sacked_out = 0;
@@ -2918,10 +2982,19 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2918 if (tcp_check_sack_reneging(sk, flag)) 2982 if (tcp_check_sack_reneging(sk, flag))
2919 return; 2983 return;
2920 2984
2921 /* C. Check consistency of the current state. */ 2985 /* C. Process data loss notification, provided it is valid. */
2986 if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
2987 before(tp->snd_una, tp->high_seq) &&
2988 icsk->icsk_ca_state != TCP_CA_Open &&
2989 tp->fackets_out > tp->reordering) {
2990 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2991 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2992 }
2993
2994 /* D. Check consistency of the current state. */
2922 tcp_verify_left_out(tp); 2995 tcp_verify_left_out(tp);
2923 2996
2924 /* D. Check state exit conditions. State can be terminated 2997 /* E. Check state exit conditions. State can be terminated
2925 * when high_seq is ACKed. */ 2998 * when high_seq is ACKed. */
2926 if (icsk->icsk_ca_state == TCP_CA_Open) { 2999 if (icsk->icsk_ca_state == TCP_CA_Open) {
2927 WARN_ON(tp->retrans_out != 0); 3000 WARN_ON(tp->retrans_out != 0);
@@ -2938,7 +3011,18 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2938 /* CWR is to be held something *above* high_seq 3011 /* CWR is to be held something *above* high_seq
2939 * is ACKed for CWR bit to reach receiver. */ 3012 * is ACKed for CWR bit to reach receiver. */
2940 if (tp->snd_una != tp->high_seq) { 3013 if (tp->snd_una != tp->high_seq) {
2941 tcp_end_cwnd_reduction(sk); 3014 tcp_complete_cwr(sk);
3015 tcp_set_ca_state(sk, TCP_CA_Open);
3016 }
3017 break;
3018
3019 case TCP_CA_Disorder:
3020 tcp_try_undo_dsack(sk);
3021 if (!tp->undo_marker ||
3022 /* For SACK case do not Open to allow to undo
3023 * catching for all duplicate ACKs. */
3024 tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
3025 tp->undo_marker = 0;
2942 tcp_set_ca_state(sk, TCP_CA_Open); 3026 tcp_set_ca_state(sk, TCP_CA_Open);
2943 } 3027 }
2944 break; 3028 break;
@@ -2948,12 +3032,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2948 tcp_reset_reno_sack(tp); 3032 tcp_reset_reno_sack(tp);
2949 if (tcp_try_undo_recovery(sk)) 3033 if (tcp_try_undo_recovery(sk))
2950 return; 3034 return;
2951 tcp_end_cwnd_reduction(sk); 3035 tcp_complete_cwr(sk);
2952 break; 3036 break;
2953 } 3037 }
2954 } 3038 }
2955 3039
2956 /* E. Process state. */ 3040 /* F. Process state. */
2957 switch (icsk->icsk_ca_state) { 3041 switch (icsk->icsk_ca_state) {
2958 case TCP_CA_Recovery: 3042 case TCP_CA_Recovery:
2959 if (!(flag & FLAG_SND_UNA_ADVANCED)) { 3043 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
@@ -2961,7 +3045,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2961 tcp_add_reno_sack(sk); 3045 tcp_add_reno_sack(sk);
2962 } else 3046 } else
2963 do_lost = tcp_try_undo_partial(sk, pkts_acked); 3047 do_lost = tcp_try_undo_partial(sk, pkts_acked);
2964 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2965 break; 3048 break;
2966 case TCP_CA_Loss: 3049 case TCP_CA_Loss:
2967 if (flag & FLAG_DATA_ACKED) 3050 if (flag & FLAG_DATA_ACKED)
@@ -2983,13 +3066,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2983 if (is_dupack) 3066 if (is_dupack)
2984 tcp_add_reno_sack(sk); 3067 tcp_add_reno_sack(sk);
2985 } 3068 }
2986 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2987 3069
2988 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 3070 if (icsk->icsk_ca_state == TCP_CA_Disorder)
2989 tcp_try_undo_dsack(sk); 3071 tcp_try_undo_dsack(sk);
2990 3072
2991 if (!tcp_time_to_recover(sk, flag)) { 3073 if (!tcp_time_to_recover(sk)) {
2992 tcp_try_to_open(sk, flag, newly_acked_sacked); 3074 tcp_try_to_open(sk, flag);
2993 return; 3075 return;
2994 } 3076 }
2995 3077
@@ -3005,13 +3087,35 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3005 } 3087 }
3006 3088
3007 /* Otherwise enter Recovery state */ 3089 /* Otherwise enter Recovery state */
3008 tcp_enter_recovery(sk, (flag & FLAG_ECE)); 3090
3091 if (tcp_is_reno(tp))
3092 mib_idx = LINUX_MIB_TCPRENORECOVERY;
3093 else
3094 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
3095
3096 NET_INC_STATS_BH(sock_net(sk), mib_idx);
3097
3098 tp->high_seq = tp->snd_nxt;
3099 tp->prior_ssthresh = 0;
3100 tp->undo_marker = tp->snd_una;
3101 tp->undo_retrans = tp->retrans_out;
3102
3103 if (icsk->icsk_ca_state < TCP_CA_CWR) {
3104 if (!(flag & FLAG_ECE))
3105 tp->prior_ssthresh = tcp_current_ssthresh(sk);
3106 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
3107 TCP_ECN_queue_cwr(tp);
3108 }
3109
3110 tp->bytes_acked = 0;
3111 tp->snd_cwnd_cnt = 0;
3112 tcp_set_ca_state(sk, TCP_CA_Recovery);
3009 fast_rexmit = 1; 3113 fast_rexmit = 1;
3010 } 3114 }
3011 3115
3012 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) 3116 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
3013 tcp_update_scoreboard(sk, fast_rexmit); 3117 tcp_update_scoreboard(sk, fast_rexmit);
3014 tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); 3118 tcp_cwnd_down(sk, flag);
3015 tcp_xmit_retransmit_queue(sk); 3119 tcp_xmit_retransmit_queue(sk);
3016} 3120}
3017 3121
@@ -3086,53 +3190,16 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3086/* Restart timer after forward progress on connection. 3190/* Restart timer after forward progress on connection.
3087 * RFC2988 recommends to restart timer to now+rto. 3191 * RFC2988 recommends to restart timer to now+rto.
3088 */ 3192 */
3089void tcp_rearm_rto(struct sock *sk) 3193static void tcp_rearm_rto(struct sock *sk)
3090{ 3194{
3091 struct tcp_sock *tp = tcp_sk(sk); 3195 struct tcp_sock *tp = tcp_sk(sk);
3092 3196
3093 /* If the retrans timer is currently being used by Fast Open
3094 * for SYN-ACK retrans purpose, stay put.
3095 */
3096 if (tp->fastopen_rsk)
3097 return;
3098
3099 if (!tp->packets_out) { 3197 if (!tp->packets_out) {
3100 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3198 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3101 } else { 3199 } else {
3102 u32 rto = inet_csk(sk)->icsk_rto; 3200 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3103 /* Offset the time elapsed after installing regular RTO */ 3201 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3104 if (tp->early_retrans_delayed) {
3105 struct sk_buff *skb = tcp_write_queue_head(sk);
3106 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3107 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3108 /* delta may not be positive if the socket is locked
3109 * when the delayed ER timer fires and is rescheduled.
3110 */
3111 if (delta > 0)
3112 rto = delta;
3113 }
3114 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3115 TCP_RTO_MAX);
3116 } 3202 }
3117 tp->early_retrans_delayed = 0;
3118}
3119
3120/* This function is called when the delayed ER timer fires. TCP enters
3121 * fast recovery and performs fast-retransmit.
3122 */
3123void tcp_resume_early_retransmit(struct sock *sk)
3124{
3125 struct tcp_sock *tp = tcp_sk(sk);
3126
3127 tcp_rearm_rto(sk);
3128
3129 /* Stop if ER is disabled after the delayed ER timer is scheduled */
3130 if (!tp->do_early_retrans)
3131 return;
3132
3133 tcp_enter_recovery(sk, false);
3134 tcp_update_scoreboard(sk, 1);
3135 tcp_xmit_retransmit_queue(sk);
3136} 3203}
3137 3204
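
In the early-retransmit branch removed from tcp_rearm_rto() above, the timer is re-armed only for the portion of the RTO that has not already elapsed since the head skb was stamped, falling back to the full RTO when nothing positive remains. That arithmetic, isolated below; remaining_rto is an illustrative name, and jiffies wrap is handled by the signed subtraction as in the original:

/* Isolated form of the removed offset: remaining = head_when + rto - now,
 * taken in signed 32-bit so jiffies wrap-around behaves, with the full rto
 * used when the remainder is zero or negative. */
#include <stdint.h>

static uint32_t remaining_rto(uint32_t head_when, uint32_t rto, uint32_t now)
{
        int32_t delta = (int32_t)(head_when + rto - now);

        return delta > 0 ? (uint32_t)delta : rto;
}
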
3138/* If we get here, the whole TSO packet has not been acked. */ 3205/* If we get here, the whole TSO packet has not been acked. */
@@ -3167,7 +3234,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3167 const struct inet_connection_sock *icsk = inet_csk(sk); 3234 const struct inet_connection_sock *icsk = inet_csk(sk);
3168 struct sk_buff *skb; 3235 struct sk_buff *skb;
3169 u32 now = tcp_time_stamp; 3236 u32 now = tcp_time_stamp;
3170 int fully_acked = true; 3237 int fully_acked = 1;
3171 int flag = 0; 3238 int flag = 0;
3172 u32 pkts_acked = 0; 3239 u32 pkts_acked = 0;
3173 u32 reord = tp->packets_out; 3240 u32 reord = tp->packets_out;
@@ -3191,7 +3258,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3191 if (!acked_pcount) 3258 if (!acked_pcount)
3192 break; 3259 break;
3193 3260
3194 fully_acked = false; 3261 fully_acked = 0;
3195 } else { 3262 } else {
3196 acked_pcount = tcp_skb_pcount(skb); 3263 acked_pcount = tcp_skb_pcount(skb);
3197 } 3264 }
@@ -3229,7 +3296,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3229 * connection startup slow start one packet too 3296 * connection startup slow start one packet too
3230 * quickly. This is severely frowned upon behavior. 3297 * quickly. This is severely frowned upon behavior.
3231 */ 3298 */
3232 if (!(scb->tcp_flags & TCPHDR_SYN)) { 3299 if (!(scb->flags & TCPHDR_SYN)) {
3233 flag |= FLAG_DATA_ACKED; 3300 flag |= FLAG_DATA_ACKED;
3234 } else { 3301 } else {
3235 flag |= FLAG_SYN_ACKED; 3302 flag |= FLAG_SYN_ACKED;
@@ -3308,18 +3375,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3308 if (!tp->packets_out && tcp_is_sack(tp)) { 3375 if (!tp->packets_out && tcp_is_sack(tp)) {
3309 icsk = inet_csk(sk); 3376 icsk = inet_csk(sk);
3310 if (tp->lost_out) { 3377 if (tp->lost_out) {
3311 pr_debug("Leak l=%u %d\n", 3378 printk(KERN_DEBUG "Leak l=%u %d\n",
3312 tp->lost_out, icsk->icsk_ca_state); 3379 tp->lost_out, icsk->icsk_ca_state);
3313 tp->lost_out = 0; 3380 tp->lost_out = 0;
3314 } 3381 }
3315 if (tp->sacked_out) { 3382 if (tp->sacked_out) {
3316 pr_debug("Leak s=%u %d\n", 3383 printk(KERN_DEBUG "Leak s=%u %d\n",
3317 tp->sacked_out, icsk->icsk_ca_state); 3384 tp->sacked_out, icsk->icsk_ca_state);
3318 tp->sacked_out = 0; 3385 tp->sacked_out = 0;
3319 } 3386 }
3320 if (tp->retrans_out) { 3387 if (tp->retrans_out) {
3321 pr_debug("Leak r=%u %d\n", 3388 printk(KERN_DEBUG "Leak r=%u %d\n",
3322 tp->retrans_out, icsk->icsk_ca_state); 3389 tp->retrans_out, icsk->icsk_ca_state);
3323 tp->retrans_out = 0; 3390 tp->retrans_out = 0;
3324 } 3391 }
3325 } 3392 }
@@ -3347,23 +3414,23 @@ static void tcp_ack_probe(struct sock *sk)
3347 } 3414 }
3348} 3415}
3349 3416
3350static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) 3417static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3351{ 3418{
3352 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3419 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3353 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3420 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3354} 3421}
3355 3422
3356static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3423static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3357{ 3424{
3358 const struct tcp_sock *tp = tcp_sk(sk); 3425 const struct tcp_sock *tp = tcp_sk(sk);
3359 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3426 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
3360 !tcp_in_cwnd_reduction(sk); 3427 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
3361} 3428}
3362 3429
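The new-side tcp_may_raise_cwnd() above tests "is the connection in Recovery or CWR" with a single bitmask AND. Below is a small stand-alone sketch of that trick; the enum values are chosen to mirror the kernel's tcp_ca_state ordering, which is an assumption and not part of this diff.

#include <stdbool.h>
#include <stdio.h>

enum ca_state { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss };
#define CAF(s) (1u << (s))                       /* state -> single-bit flag */

/* True when the state is any member of the {Recovery, CWR} set. */
static bool in_cwnd_reduction(enum ca_state s)
{
	return CAF(s) & (CAF(CA_Recovery) | CAF(CA_CWR));
}

int main(void)
{
	printf("Open: %d  CWR: %d  Recovery: %d\n",
	       in_cwnd_reduction(CA_Open),
	       in_cwnd_reduction(CA_CWR),
	       in_cwnd_reduction(CA_Recovery));  /* 0 1 1 */
	return 0;
}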
3363/* Check that window update is acceptable. 3430/* Check that window update is acceptable.
3364 * The function assumes that snd_una<=ack<=snd_next. 3431 * The function assumes that snd_una<=ack<=snd_next.
3365 */ 3432 */
3366static inline bool tcp_may_update_window(const struct tcp_sock *tp, 3433static inline int tcp_may_update_window(const struct tcp_sock *tp,
3367 const u32 ack, const u32 ack_seq, 3434 const u32 ack, const u32 ack_seq,
3368 const u32 nwin) 3435 const u32 nwin)
3369{ 3436{
@@ -3377,7 +3444,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3377 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 3444 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
3378 * and in FreeBSD. NetBSD's one is even worse.) is wrong. 3445 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
3379 */ 3446 */
3380static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, 3447static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
3381 u32 ack_seq) 3448 u32 ack_seq)
3382{ 3449{
3383 struct tcp_sock *tp = tcp_sk(sk); 3450 struct tcp_sock *tp = tcp_sk(sk);
@@ -3425,9 +3492,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3425} 3492}
3426 3493
3427/* A conservative spurious RTO response algorithm: reduce cwnd using 3494/* A conservative spurious RTO response algorithm: reduce cwnd using
3428 * PRR and continue in congestion avoidance. 3495 * rate halving and continue in congestion avoidance.
3429 */ 3496 */
3430static void tcp_cwr_spur_to_response(struct sock *sk) 3497static void tcp_ratehalving_spur_to_response(struct sock *sk)
3431{ 3498{
3432 tcp_enter_cwr(sk, 0); 3499 tcp_enter_cwr(sk, 0);
3433} 3500}
@@ -3435,7 +3502,7 @@ static void tcp_cwr_spur_to_response(struct sock *sk)
3435static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3502static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3436{ 3503{
3437 if (flag & FLAG_ECE) 3504 if (flag & FLAG_ECE)
3438 tcp_cwr_spur_to_response(sk); 3505 tcp_ratehalving_spur_to_response(sk);
3439 else 3506 else
3440 tcp_undo_cwr(sk, true); 3507 tcp_undo_cwr(sk, true);
3441} 3508}
@@ -3470,7 +3537,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3470 * to prove that the RTO is indeed spurious. It transfers the control 3537 * to prove that the RTO is indeed spurious. It transfers the control
3471 * from F-RTO to the conventional RTO recovery 3538 * from F-RTO to the conventional RTO recovery
3472 */ 3539 */
3473static bool tcp_process_frto(struct sock *sk, int flag) 3540static int tcp_process_frto(struct sock *sk, int flag)
3474{ 3541{
3475 struct tcp_sock *tp = tcp_sk(sk); 3542 struct tcp_sock *tp = tcp_sk(sk);
3476 3543
@@ -3486,7 +3553,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3486 3553
3487 if (!before(tp->snd_una, tp->frto_highmark)) { 3554 if (!before(tp->snd_una, tp->frto_highmark)) {
3488 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); 3555 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
3489 return true; 3556 return 1;
3490 } 3557 }
3491 3558
3492 if (!tcp_is_sackfrto(tp)) { 3559 if (!tcp_is_sackfrto(tp)) {
@@ -3495,19 +3562,19 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3495 * data, winupdate 3562 * data, winupdate
3496 */ 3563 */
3497 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) 3564 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
3498 return true; 3565 return 1;
3499 3566
3500 if (!(flag & FLAG_DATA_ACKED)) { 3567 if (!(flag & FLAG_DATA_ACKED)) {
3501 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), 3568 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
3502 flag); 3569 flag);
3503 return true; 3570 return 1;
3504 } 3571 }
3505 } else { 3572 } else {
3506 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { 3573 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
3507 /* Prevent sending of new data. */ 3574 /* Prevent sending of new data. */
3508 tp->snd_cwnd = min(tp->snd_cwnd, 3575 tp->snd_cwnd = min(tp->snd_cwnd,
3509 tcp_packets_in_flight(tp)); 3576 tcp_packets_in_flight(tp));
3510 return true; 3577 return 1;
3511 } 3578 }
3512 3579
3513 if ((tp->frto_counter >= 2) && 3580 if ((tp->frto_counter >= 2) &&
@@ -3517,10 +3584,10 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3517 /* RFC4138 shortcoming (see comment above) */ 3584 /* RFC4138 shortcoming (see comment above) */
3518 if (!(flag & FLAG_FORWARD_PROGRESS) && 3585 if (!(flag & FLAG_FORWARD_PROGRESS) &&
3519 (flag & FLAG_NOT_DUP)) 3586 (flag & FLAG_NOT_DUP))
3520 return true; 3587 return 1;
3521 3588
3522 tcp_enter_frto_loss(sk, 3, flag); 3589 tcp_enter_frto_loss(sk, 3, flag);
3523 return true; 3590 return 1;
3524 } 3591 }
3525 } 3592 }
3526 3593
@@ -3532,7 +3599,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3532 if (!tcp_may_send_now(sk)) 3599 if (!tcp_may_send_now(sk))
3533 tcp_enter_frto_loss(sk, 2, flag); 3600 tcp_enter_frto_loss(sk, 2, flag);
3534 3601
3535 return true; 3602 return 1;
3536 } else { 3603 } else {
3537 switch (sysctl_tcp_frto_response) { 3604 switch (sysctl_tcp_frto_response) {
3538 case 2: 3605 case 2:
@@ -3542,61 +3609,34 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3542 tcp_conservative_spur_to_response(tp); 3609 tcp_conservative_spur_to_response(tp);
3543 break; 3610 break;
3544 default: 3611 default:
3545 tcp_cwr_spur_to_response(sk); 3612 tcp_ratehalving_spur_to_response(sk);
3546 break; 3613 break;
3547 } 3614 }
3548 tp->frto_counter = 0; 3615 tp->frto_counter = 0;
3549 tp->undo_marker = 0; 3616 tp->undo_marker = 0;
3550 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); 3617 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
3551 } 3618 }
3552 return false; 3619 return 0;
3553}
3554
3555/* RFC 5961 7 [ACK Throttling] */
3556static void tcp_send_challenge_ack(struct sock *sk)
3557{
3558 /* unprotected vars, we dont care of overwrites */
3559 static u32 challenge_timestamp;
3560 static unsigned int challenge_count;
3561 u32 now = jiffies / HZ;
3562
3563 if (now != challenge_timestamp) {
3564 challenge_timestamp = now;
3565 challenge_count = 0;
3566 }
3567 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
3568 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3569 tcp_send_ack(sk);
3570 }
3571} 3620}
3572 3621
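The tcp_send_challenge_ack() removed above rate-limits challenge ACKs to a fixed number per second (RFC 5961 section 7). A rough user-space sketch of that counter-reset pattern follows; the limit value and the send/drop reporting are made up for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static const unsigned int challenge_ack_limit = 3;  /* hypothetical per-second cap */

/* Returns true when the current one-second window is already over the cap. */
static bool challenge_ack_throttled(time_t now)
{
	static time_t window_start;
	static unsigned int count;

	if (now != window_start) {   /* new second: reset the counter */
		window_start = now;
		count = 0;
	}
	return ++count > challenge_ack_limit;
}

int main(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < 5; i++)
		printf("ack %d: %s\n", i + 1,
		       challenge_ack_throttled(now) ? "dropped" : "challenge ACK sent");
	return 0;
}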
3573/* This routine deals with incoming acks, but not outgoing ones. */ 3622/* This routine deals with incoming acks, but not outgoing ones. */
3574static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3623static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3575{ 3624{
3576 struct inet_connection_sock *icsk = inet_csk(sk); 3625 struct inet_connection_sock *icsk = inet_csk(sk);
3577 struct tcp_sock *tp = tcp_sk(sk); 3626 struct tcp_sock *tp = tcp_sk(sk);
3578 u32 prior_snd_una = tp->snd_una; 3627 u32 prior_snd_una = tp->snd_una;
3579 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3628 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3580 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3629 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3581 bool is_dupack = false;
3582 u32 prior_in_flight; 3630 u32 prior_in_flight;
3583 u32 prior_fackets; 3631 u32 prior_fackets;
3584 int prior_packets; 3632 int prior_packets;
3585 int prior_sacked = tp->sacked_out; 3633 int frto_cwnd = 0;
3586 int pkts_acked = 0;
3587 bool frto_cwnd = false;
3588 3634
3589 /* If the ack is older than previous acks 3635 /* If the ack is older than previous acks
3590 * then we can probably ignore it. 3636 * then we can probably ignore it.
3591 */ 3637 */
3592 if (before(ack, prior_snd_una)) { 3638 if (before(ack, prior_snd_una))
3593 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3594 if (before(ack, prior_snd_una - tp->max_window)) {
3595 tcp_send_challenge_ack(sk);
3596 return -1;
3597 }
3598 goto old_ack; 3639 goto old_ack;
3599 }
3600 3640
3601 /* If the ack includes data we haven't sent yet, discard 3641 /* If the ack includes data we haven't sent yet, discard
3602 * this segment (RFC793 Section 3.9). 3642 * this segment (RFC793 Section 3.9).
@@ -3604,9 +3644,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3604 if (after(ack, tp->snd_nxt)) 3644 if (after(ack, tp->snd_nxt))
3605 goto invalid_ack; 3645 goto invalid_ack;
3606 3646
3607 if (tp->early_retrans_delayed)
3608 tcp_rearm_rto(sk);
3609
3610 if (after(ack, prior_snd_una)) 3647 if (after(ack, prior_snd_una))
3611 flag |= FLAG_SND_UNA_ADVANCED; 3648 flag |= FLAG_SND_UNA_ADVANCED;
3612 3649
@@ -3664,8 +3701,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3664 /* See if we can take anything off of the retransmit queue. */ 3701 /* See if we can take anything off of the retransmit queue. */
3665 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3702 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3666 3703
3667 pkts_acked = prior_packets - tp->packets_out;
3668
3669 if (tp->frto_counter) 3704 if (tp->frto_counter)
3670 frto_cwnd = tcp_process_frto(sk, flag); 3705 frto_cwnd = tcp_process_frto(sk, flag);
3671 /* Guarantee sacktag reordering detection against wrap-arounds */ 3706 /* Guarantee sacktag reordering detection against wrap-arounds */
@@ -3677,26 +3712,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3677 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && 3712 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
3678 tcp_may_raise_cwnd(sk, flag)) 3713 tcp_may_raise_cwnd(sk, flag))
3679 tcp_cong_avoid(sk, ack, prior_in_flight); 3714 tcp_cong_avoid(sk, ack, prior_in_flight);
3680 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3715 tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
3681 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3716 flag);
3682 is_dupack, flag);
3683 } else { 3717 } else {
3684 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3718 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
3685 tcp_cong_avoid(sk, ack, prior_in_flight); 3719 tcp_cong_avoid(sk, ack, prior_in_flight);
3686 } 3720 }
3687 3721
3688 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3722 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3689 struct dst_entry *dst = __sk_dst_get(sk); 3723 dst_confirm(__sk_dst_get(sk));
3690 if (dst) 3724
3691 dst_confirm(dst);
3692 }
3693 return 1; 3725 return 1;
3694 3726
3695no_queue: 3727no_queue:
3696 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3697 if (flag & FLAG_DSACKING_ACK)
3698 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3699 is_dupack, flag);
3700 /* If this ack opens up a zero window, clear backoff. It was 3728 /* If this ack opens up a zero window, clear backoff. It was
3701 * being used to time the probes, and is probably far higher than 3729 * being used to time the probes, and is probably far higher than
3702 * it needs to be for normal retransmission. 3730 * it needs to be for normal retransmission.
@@ -3710,13 +3738,10 @@ invalid_ack:
3710 return -1; 3738 return -1;
3711 3739
3712old_ack: 3740old_ack:
3713 /* If data was SACKed, tag it and see if we should send more data.
3714 * If data was DSACKed, see if we can undo a cwnd reduction.
3715 */
3716 if (TCP_SKB_CB(skb)->sacked) { 3741 if (TCP_SKB_CB(skb)->sacked) {
3717 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3742 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3718 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3743 if (icsk->icsk_ca_state == TCP_CA_Open)
3719 is_dupack, flag); 3744 tcp_try_keep_open(sk);
3720 } 3745 }
3721 3746
3722 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3747 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3727,15 +3752,14 @@ old_ack:
3727 * But, this can also be called on packets in the established flow when 3752 * But, this can also be called on packets in the established flow when
3728 * the fast version below fails. 3753 * the fast version below fails.
3729 */ 3754 */
3730void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3755void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3731 const u8 **hvpp, int estab, 3756 u8 **hvpp, int estab)
3732 struct tcp_fastopen_cookie *foc)
3733{ 3757{
3734 const unsigned char *ptr; 3758 unsigned char *ptr;
3735 const struct tcphdr *th = tcp_hdr(skb); 3759 struct tcphdr *th = tcp_hdr(skb);
3736 int length = (th->doff * 4) - sizeof(struct tcphdr); 3760 int length = (th->doff * 4) - sizeof(struct tcphdr);
3737 3761
3738 ptr = (const unsigned char *)(th + 1); 3762 ptr = (unsigned char *)(th + 1);
3739 opt_rx->saw_tstamp = 0; 3763 opt_rx->saw_tstamp = 0;
3740 3764
3741 while (length > 0) { 3765 while (length > 0) {
@@ -3772,9 +3796,10 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3772 __u8 snd_wscale = *(__u8 *)ptr; 3796 __u8 snd_wscale = *(__u8 *)ptr;
3773 opt_rx->wscale_ok = 1; 3797 opt_rx->wscale_ok = 1;
3774 if (snd_wscale > 14) { 3798 if (snd_wscale > 14) {
3775 net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", 3799 if (net_ratelimit())
3776 __func__, 3800 printk(KERN_INFO "tcp_parse_options: Illegal window "
3777 snd_wscale); 3801 "scaling value %d >14 received.\n",
3802 snd_wscale);
3778 snd_wscale = 14; 3803 snd_wscale = 14;
3779 } 3804 }
3780 opt_rx->snd_wscale = snd_wscale; 3805 opt_rx->snd_wscale = snd_wscale;
@@ -3792,7 +3817,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3792 case TCPOPT_SACK_PERM: 3817 case TCPOPT_SACK_PERM:
3793 if (opsize == TCPOLEN_SACK_PERM && th->syn && 3818 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3794 !estab && sysctl_tcp_sack) { 3819 !estab && sysctl_tcp_sack) {
3795 opt_rx->sack_ok = TCP_SACK_SEEN; 3820 opt_rx->sack_ok = 1;
3796 tcp_sack_reset(opt_rx); 3821 tcp_sack_reset(opt_rx);
3797 } 3822 }
3798 break; 3823 break;
@@ -3836,25 +3861,8 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3836 break; 3861 break;
3837 } 3862 }
3838 break; 3863 break;
3839
3840 case TCPOPT_EXP:
3841 /* Fast Open option shares code 254 using a
3842 * 16 bits magic number. It's valid only in
3843 * SYN or SYN-ACK with an even size.
3844 */
3845 if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
3846 get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
3847 foc == NULL || !th->syn || (opsize & 1))
3848 break;
3849 foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
3850 if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
3851 foc->len <= TCP_FASTOPEN_COOKIE_MAX)
3852 memcpy(foc->val, ptr + 2, foc->len);
3853 else if (foc->len != 0)
3854 foc->len = -1;
3855 break;
3856
3857 } 3864 }
3865
3858 ptr += opsize-2; 3866 ptr += opsize-2;
3859 length -= opsize; 3867 length -= opsize;
3860 } 3868 }
@@ -3862,9 +3870,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3862} 3870}
3863EXPORT_SYMBOL(tcp_parse_options); 3871EXPORT_SYMBOL(tcp_parse_options);
3864 3872
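On the window-scale clamp in tcp_parse_options() above: a scale factor of 14 already lets the 16-bit window field advertise roughly 1 GiB, the RFC 1323 maximum, which is why larger values are logged and clamped. A tiny worked example in plain arithmetic, not kernel code:

#include <stdio.h>

int main(void)
{
	const unsigned long long max_win = 65535ULL;   /* 16-bit window field */

	for (int wscale = 13; wscale <= 15; wscale++)
		printf("wscale %2d -> up to %llu bytes%s\n",
		       wscale, max_win << wscale,
		       wscale > 14 ? "  (illegal, clamped to 14)" : "");
	return 0;
}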
3865static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) 3873static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3866{ 3874{
3867 const __be32 *ptr = (const __be32 *)(th + 1); 3875 __be32 *ptr = (__be32 *)(th + 1);
3868 3876
3869 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 3877 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3870 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { 3878 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
@@ -3873,41 +3881,40 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3873 tp->rx_opt.rcv_tsval = ntohl(*ptr); 3881 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3874 ++ptr; 3882 ++ptr;
3875 tp->rx_opt.rcv_tsecr = ntohl(*ptr); 3883 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3876 return true; 3884 return 1;
3877 } 3885 }
3878 return false; 3886 return 0;
3879} 3887}
3880 3888
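tcp_parse_aligned_timestamp() above matches the one layout RFC 1323 recommends on an established connection: NOP, NOP, TIMESTAMP (kind 8, length 10), packed into a single predicted 32-bit word, with doff covering the 20-byte header plus 12 aligned option bytes. A stand-alone check of that constant:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

int main(void)
{
	uint32_t word = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			      (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);

	assert(ntohl(word) == 0x0101080a);             /* the predicted pattern */
	printf("option word 0x%08x, doff = %d\n",
	       ntohl(word), (20 + 12) / 4);            /* doff = 8 */
	return 0;
}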
3881/* Fast parse options. This hopes to only see timestamps. 3889/* Fast parse options. This hopes to only see timestamps.
3882 * If it is wrong it falls back on tcp_parse_options(). 3890 * If it is wrong it falls back on tcp_parse_options().
3883 */ 3891 */
3884static bool tcp_fast_parse_options(const struct sk_buff *skb, 3892static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3885 const struct tcphdr *th, 3893 struct tcp_sock *tp, u8 **hvpp)
3886 struct tcp_sock *tp, const u8 **hvpp)
3887{ 3894{
3888 /* In the spirit of fast parsing, compare doff directly to constant 3895 /* In the spirit of fast parsing, compare doff directly to constant
3889 * values. Because equality is used, short doff can be ignored here. 3896 * values. Because equality is used, short doff can be ignored here.
3890 */ 3897 */
3891 if (th->doff == (sizeof(*th) / 4)) { 3898 if (th->doff == (sizeof(*th) / 4)) {
3892 tp->rx_opt.saw_tstamp = 0; 3899 tp->rx_opt.saw_tstamp = 0;
3893 return false; 3900 return 0;
3894 } else if (tp->rx_opt.tstamp_ok && 3901 } else if (tp->rx_opt.tstamp_ok &&
3895 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { 3902 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3896 if (tcp_parse_aligned_timestamp(tp, th)) 3903 if (tcp_parse_aligned_timestamp(tp, th))
3897 return true; 3904 return 1;
3898 } 3905 }
3899 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); 3906 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3900 return true; 3907 return 1;
3901} 3908}
3902 3909
3903#ifdef CONFIG_TCP_MD5SIG 3910#ifdef CONFIG_TCP_MD5SIG
3904/* 3911/*
3905 * Parse MD5 Signature option 3912 * Parse MD5 Signature option
3906 */ 3913 */
3907const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) 3914u8 *tcp_parse_md5sig_option(struct tcphdr *th)
3908{ 3915{
3909 int length = (th->doff << 2) - sizeof(*th); 3916 int length = (th->doff << 2) - sizeof (*th);
3910 const u8 *ptr = (const u8 *)(th + 1); 3917 u8 *ptr = (u8*)(th + 1);
3911 3918
3912 /* If the TCP option is too short, we can short cut */ 3919 /* If the TCP option is too short, we can short cut */
3913 if (length < TCPOLEN_MD5SIG) 3920 if (length < TCPOLEN_MD5SIG)
@@ -3984,8 +3991,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3984 3991
3985static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) 3992static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3986{ 3993{
3987 const struct tcp_sock *tp = tcp_sk(sk); 3994 struct tcp_sock *tp = tcp_sk(sk);
3988 const struct tcphdr *th = tcp_hdr(skb); 3995 struct tcphdr *th = tcp_hdr(skb);
3989 u32 seq = TCP_SKB_CB(skb)->seq; 3996 u32 seq = TCP_SKB_CB(skb)->seq;
3990 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3997 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3991 3998
@@ -4002,7 +4009,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
4002 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); 4009 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
4003} 4010}
4004 4011
4005static inline bool tcp_paws_discard(const struct sock *sk, 4012static inline int tcp_paws_discard(const struct sock *sk,
4006 const struct sk_buff *skb) 4013 const struct sk_buff *skb)
4007{ 4014{
4008 const struct tcp_sock *tp = tcp_sk(sk); 4015 const struct tcp_sock *tp = tcp_sk(sk);
@@ -4024,14 +4031,14 @@ static inline bool tcp_paws_discard(const struct sock *sk,
4024 * (borrowed from freebsd) 4031 * (borrowed from freebsd)
4025 */ 4032 */
4026 4033
4027static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 4034static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
4028{ 4035{
4029 return !before(end_seq, tp->rcv_wup) && 4036 return !before(end_seq, tp->rcv_wup) &&
4030 !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 4037 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
4031} 4038}
4032 4039
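tcp_sequence() above accepts a segment when its end is not before rcv_wup and its start is not after rcv_nxt plus the receive window, using wrap-safe 32-bit comparisons. A self-contained sketch, with before()/after() reimplemented the usual signed-difference way (an assumption about their definition; they live elsewhere in the tree):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool after(uint32_t a, uint32_t b)  { return (int32_t)(b - a) < 0; }

static bool seq_acceptable(uint32_t seq, uint32_t end_seq,
			   uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t win)
{
	return !before(end_seq, rcv_wup) && !after(seq, rcv_nxt + win);
}

int main(void)
{
	uint32_t rcv_nxt = 0xfffffff0u;                /* close to the 2^32 wrap */

	/* In-window data that wraps past zero is still accepted. */
	printf("%d\n", seq_acceptable(rcv_nxt, rcv_nxt + 100,
				      rcv_nxt, rcv_nxt, 65535));        /* 1 */
	/* Data entirely below rcv_wup is rejected. */
	printf("%d\n", seq_acceptable(rcv_nxt - 200000, rcv_nxt - 199000,
				      rcv_nxt, rcv_nxt, 65535));        /* 0 */
	return 0;
}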
4033/* When we get a reset we do this. */ 4040/* When we get a reset we do this. */
4034void tcp_reset(struct sock *sk) 4041static void tcp_reset(struct sock *sk)
4035{ 4042{
4036 /* We want the right error as BSD sees it (and indeed as we do). */ 4043 /* We want the right error as BSD sees it (and indeed as we do). */
4037 switch (sk->sk_state) { 4044 switch (sk->sk_state) {
@@ -4069,7 +4076,7 @@ void tcp_reset(struct sock *sk)
4069 * 4076 *
4070 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. 4077 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4071 */ 4078 */
4072static void tcp_fin(struct sock *sk) 4079static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
4073{ 4080{
4074 struct tcp_sock *tp = tcp_sk(sk); 4081 struct tcp_sock *tp = tcp_sk(sk);
4075 4082
@@ -4113,7 +4120,7 @@ static void tcp_fin(struct sock *sk)
4113 /* Only TCP_LISTEN and TCP_CLOSE are left, in these 4120 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
4114 * cases we should never reach this piece of code. 4121 * cases we should never reach this piece of code.
4115 */ 4122 */
4116 pr_err("%s: Impossible, sk->sk_state=%d\n", 4123 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
4117 __func__, sk->sk_state); 4124 __func__, sk->sk_state);
4118 break; 4125 break;
4119 } 4126 }
@@ -4138,7 +4145,7 @@ static void tcp_fin(struct sock *sk)
4138 } 4145 }
4139} 4146}
4140 4147
4141static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, 4148static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4142 u32 end_seq) 4149 u32 end_seq)
4143{ 4150{
4144 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 4151 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4146,9 +4153,9 @@ static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4146 sp->start_seq = seq; 4153 sp->start_seq = seq;
4147 if (after(end_seq, sp->end_seq)) 4154 if (after(end_seq, sp->end_seq))
4148 sp->end_seq = end_seq; 4155 sp->end_seq = end_seq;
4149 return true; 4156 return 1;
4150 } 4157 }
4151 return false; 4158 return 0;
4152} 4159}
4153 4160
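tcp_sack_extend() above grows an existing SACK block whenever the new range touches or overlaps it, and otherwise reports no merge. A simplified sketch with plain integer comparisons in place of the kernel's wrap-safe before()/after():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sack_block { uint32_t start_seq, end_seq; };

/* Merge [seq, end_seq) into *sp when the ranges touch or overlap. */
static bool sack_extend(struct sack_block *sp, uint32_t seq, uint32_t end_seq)
{
	if (seq <= sp->end_seq && sp->start_seq <= end_seq) {
		if (seq < sp->start_seq)
			sp->start_seq = seq;
		if (end_seq > sp->end_seq)
			sp->end_seq = end_seq;
		return true;
	}
	return false;
}

int main(void)
{
	struct sack_block sp = { .start_seq = 1000, .end_seq = 2000 };

	printf("%d -> [%u,%u)\n", sack_extend(&sp, 2000, 2500),
	       sp.start_seq, sp.end_seq);              /* 1 -> [1000,2500) */
	printf("%d -> [%u,%u)\n", sack_extend(&sp, 4000, 4500),
	       sp.start_seq, sp.end_seq);              /* 0 -> unchanged   */
	return 0;
}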
4154static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) 4161static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4181,7 +4188,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4181 tcp_sack_extend(tp->duplicate_sack, seq, end_seq); 4188 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4182} 4189}
4183 4190
4184static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) 4191static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
4185{ 4192{
4186 struct tcp_sock *tp = tcp_sk(sk); 4193 struct tcp_sock *tp = tcp_sk(sk);
4187 4194
@@ -4340,258 +4347,37 @@ static void tcp_ofo_queue(struct sock *sk)
4340 __skb_queue_tail(&sk->sk_receive_queue, skb); 4347 __skb_queue_tail(&sk->sk_receive_queue, skb);
4341 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4348 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4342 if (tcp_hdr(skb)->fin) 4349 if (tcp_hdr(skb)->fin)
4343 tcp_fin(sk); 4350 tcp_fin(skb, sk, tcp_hdr(skb));
4344 } 4351 }
4345} 4352}
4346 4353
4347static bool tcp_prune_ofo_queue(struct sock *sk); 4354static int tcp_prune_ofo_queue(struct sock *sk);
4348static int tcp_prune_queue(struct sock *sk); 4355static int tcp_prune_queue(struct sock *sk);
4349 4356
4350static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4357static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
4351 unsigned int size)
4352{ 4358{
4353 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 4359 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4354 !sk_rmem_schedule(sk, skb, size)) { 4360 !sk_rmem_schedule(sk, size)) {
4355 4361
4356 if (tcp_prune_queue(sk) < 0) 4362 if (tcp_prune_queue(sk) < 0)
4357 return -1; 4363 return -1;
4358 4364
4359 if (!sk_rmem_schedule(sk, skb, size)) { 4365 if (!sk_rmem_schedule(sk, size)) {
4360 if (!tcp_prune_ofo_queue(sk)) 4366 if (!tcp_prune_ofo_queue(sk))
4361 return -1; 4367 return -1;
4362 4368
4363 if (!sk_rmem_schedule(sk, skb, size)) 4369 if (!sk_rmem_schedule(sk, size))
4364 return -1; 4370 return -1;
4365 } 4371 }
4366 } 4372 }
4367 return 0; 4373 return 0;
4368} 4374}
4369 4375
4370/**
4371 * tcp_try_coalesce - try to merge skb to prior one
4372 * @sk: socket
4373 * @to: prior buffer
4374 * @from: buffer to add in queue
4375 * @fragstolen: pointer to boolean
4376 *
4377 * Before queueing skb @from after @to, try to merge them
4378 * to reduce overall memory use and queue lengths, if cost is small.
4379 * Packets in ofo or receive queues can stay a long time.
4380 * Better try to coalesce them right now to avoid future collapses.
4381 * Returns true if caller should free @from instead of queueing it
4382 */
4383static bool tcp_try_coalesce(struct sock *sk,
4384 struct sk_buff *to,
4385 struct sk_buff *from,
4386 bool *fragstolen)
4387{
4388 int delta;
4389
4390 *fragstolen = false;
4391
4392 if (tcp_hdr(from)->fin)
4393 return false;
4394
4395 /* Its possible this segment overlaps with prior segment in queue */
4396 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4397 return false;
4398
4399 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4400 return false;
4401
4402 atomic_add(delta, &sk->sk_rmem_alloc);
4403 sk_mem_charge(sk, delta);
4404 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4405 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4406 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4407 return true;
4408}
4409
4410static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4411{
4412 struct tcp_sock *tp = tcp_sk(sk);
4413 struct sk_buff *skb1;
4414 u32 seq, end_seq;
4415
4416 TCP_ECN_check_ce(tp, skb);
4417
4418 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4419 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4420 __kfree_skb(skb);
4421 return;
4422 }
4423
4424 /* Disable header prediction. */
4425 tp->pred_flags = 0;
4426 inet_csk_schedule_ack(sk);
4427
4428 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4429 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4430 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4431
4432 skb1 = skb_peek_tail(&tp->out_of_order_queue);
4433 if (!skb1) {
4434 /* Initial out of order segment, build 1 SACK. */
4435 if (tcp_is_sack(tp)) {
4436 tp->rx_opt.num_sacks = 1;
4437 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4438 tp->selective_acks[0].end_seq =
4439 TCP_SKB_CB(skb)->end_seq;
4440 }
4441 __skb_queue_head(&tp->out_of_order_queue, skb);
4442 goto end;
4443 }
4444
4445 seq = TCP_SKB_CB(skb)->seq;
4446 end_seq = TCP_SKB_CB(skb)->end_seq;
4447
4448 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4449 bool fragstolen;
4450
4451 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4452 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4453 } else {
4454 kfree_skb_partial(skb, fragstolen);
4455 skb = NULL;
4456 }
4457
4458 if (!tp->rx_opt.num_sacks ||
4459 tp->selective_acks[0].end_seq != seq)
4460 goto add_sack;
4461
4462 /* Common case: data arrive in order after hole. */
4463 tp->selective_acks[0].end_seq = end_seq;
4464 goto end;
4465 }
4466
4467 /* Find place to insert this segment. */
4468 while (1) {
4469 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4470 break;
4471 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4472 skb1 = NULL;
4473 break;
4474 }
4475 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4476 }
4477
4478 /* Do skb overlap to previous one? */
4479 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4480 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4481 /* All the bits are present. Drop. */
4482 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4483 __kfree_skb(skb);
4484 skb = NULL;
4485 tcp_dsack_set(sk, seq, end_seq);
4486 goto add_sack;
4487 }
4488 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4489 /* Partial overlap. */
4490 tcp_dsack_set(sk, seq,
4491 TCP_SKB_CB(skb1)->end_seq);
4492 } else {
4493 if (skb_queue_is_first(&tp->out_of_order_queue,
4494 skb1))
4495 skb1 = NULL;
4496 else
4497 skb1 = skb_queue_prev(
4498 &tp->out_of_order_queue,
4499 skb1);
4500 }
4501 }
4502 if (!skb1)
4503 __skb_queue_head(&tp->out_of_order_queue, skb);
4504 else
4505 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4506
4507 /* And clean segments covered by new one as whole. */
4508 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4509 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4510
4511 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4512 break;
4513 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4514 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4515 end_seq);
4516 break;
4517 }
4518 __skb_unlink(skb1, &tp->out_of_order_queue);
4519 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4520 TCP_SKB_CB(skb1)->end_seq);
4521 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4522 __kfree_skb(skb1);
4523 }
4524
4525add_sack:
4526 if (tcp_is_sack(tp))
4527 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4528end:
4529 if (skb)
4530 skb_set_owner_r(skb, sk);
4531}
4532
4533static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4534 bool *fragstolen)
4535{
4536 int eaten;
4537 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4538
4539 __skb_pull(skb, hdrlen);
4540 eaten = (tail &&
4541 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4542 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4543 if (!eaten) {
4544 __skb_queue_tail(&sk->sk_receive_queue, skb);
4545 skb_set_owner_r(skb, sk);
4546 }
4547 return eaten;
4548}
4549
4550int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4551{
4552 struct sk_buff *skb = NULL;
4553 struct tcphdr *th;
4554 bool fragstolen;
4555
4556 if (size == 0)
4557 return 0;
4558
4559 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4560 if (!skb)
4561 goto err;
4562
4563 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
4564 goto err_free;
4565
4566 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4567 skb_reset_transport_header(skb);
4568 memset(th, 0, sizeof(*th));
4569
4570 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4571 goto err_free;
4572
4573 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4574 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4575 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4576
4577 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
4578 WARN_ON_ONCE(fragstolen); /* should not happen */
4579 __kfree_skb(skb);
4580 }
4581 return size;
4582
4583err_free:
4584 kfree_skb(skb);
4585err:
4586 return -ENOMEM;
4587}
4588
4589static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4376static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4590{ 4377{
4591 const struct tcphdr *th = tcp_hdr(skb); 4378 struct tcphdr *th = tcp_hdr(skb);
4592 struct tcp_sock *tp = tcp_sk(sk); 4379 struct tcp_sock *tp = tcp_sk(sk);
4593 int eaten = -1; 4380 int eaten = -1;
4594 bool fragstolen = false;
4595 4381
4596 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) 4382 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4597 goto drop; 4383 goto drop;
@@ -4633,16 +4419,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4633 if (eaten <= 0) { 4419 if (eaten <= 0) {
4634queue_and_out: 4420queue_and_out:
4635 if (eaten < 0 && 4421 if (eaten < 0 &&
4636 tcp_try_rmem_schedule(sk, skb, skb->truesize)) 4422 tcp_try_rmem_schedule(sk, skb->truesize))
4637 goto drop; 4423 goto drop;
4638 4424
4639 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4425 skb_set_owner_r(skb, sk);
4426 __skb_queue_tail(&sk->sk_receive_queue, skb);
4640 } 4427 }
4641 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4428 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4642 if (skb->len) 4429 if (skb->len)
4643 tcp_event_data_recv(sk, skb); 4430 tcp_event_data_recv(sk, skb);
4644 if (th->fin) 4431 if (th->fin)
4645 tcp_fin(sk); 4432 tcp_fin(skb, sk, th);
4646 4433
4647 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4434 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4648 tcp_ofo_queue(sk); 4435 tcp_ofo_queue(sk);
@@ -4660,8 +4447,8 @@ queue_and_out:
4660 tcp_fast_path_check(sk); 4447 tcp_fast_path_check(sk);
4661 4448
4662 if (eaten > 0) 4449 if (eaten > 0)
4663 kfree_skb_partial(skb, fragstolen); 4450 __kfree_skb(skb);
4664 if (!sock_flag(sk, SOCK_DEAD)) 4451 else if (!sock_flag(sk, SOCK_DEAD))
4665 sk->sk_data_ready(sk, 0); 4452 sk->sk_data_ready(sk, 0);
4666 return; 4453 return;
4667 } 4454 }
@@ -4701,7 +4488,105 @@ drop:
4701 goto queue_and_out; 4488 goto queue_and_out;
4702 } 4489 }
4703 4490
4704 tcp_data_queue_ofo(sk, skb); 4491 TCP_ECN_check_ce(tp, skb);
4492
4493 if (tcp_try_rmem_schedule(sk, skb->truesize))
4494 goto drop;
4495
4496 /* Disable header prediction. */
4497 tp->pred_flags = 0;
4498 inet_csk_schedule_ack(sk);
4499
4500 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4501 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4502
4503 skb_set_owner_r(skb, sk);
4504
4505 if (!skb_peek(&tp->out_of_order_queue)) {
4506 /* Initial out of order segment, build 1 SACK. */
4507 if (tcp_is_sack(tp)) {
4508 tp->rx_opt.num_sacks = 1;
4509 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4510 tp->selective_acks[0].end_seq =
4511 TCP_SKB_CB(skb)->end_seq;
4512 }
4513 __skb_queue_head(&tp->out_of_order_queue, skb);
4514 } else {
4515 struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
4516 u32 seq = TCP_SKB_CB(skb)->seq;
4517 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4518
4519 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4520 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4521
4522 if (!tp->rx_opt.num_sacks ||
4523 tp->selective_acks[0].end_seq != seq)
4524 goto add_sack;
4525
4526 /* Common case: data arrive in order after hole. */
4527 tp->selective_acks[0].end_seq = end_seq;
4528 return;
4529 }
4530
4531 /* Find place to insert this segment. */
4532 while (1) {
4533 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4534 break;
4535 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4536 skb1 = NULL;
4537 break;
4538 }
4539 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4540 }
4541
4542 /* Do skb overlap to previous one? */
4543 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4544 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4545 /* All the bits are present. Drop. */
4546 __kfree_skb(skb);
4547 tcp_dsack_set(sk, seq, end_seq);
4548 goto add_sack;
4549 }
4550 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4551 /* Partial overlap. */
4552 tcp_dsack_set(sk, seq,
4553 TCP_SKB_CB(skb1)->end_seq);
4554 } else {
4555 if (skb_queue_is_first(&tp->out_of_order_queue,
4556 skb1))
4557 skb1 = NULL;
4558 else
4559 skb1 = skb_queue_prev(
4560 &tp->out_of_order_queue,
4561 skb1);
4562 }
4563 }
4564 if (!skb1)
4565 __skb_queue_head(&tp->out_of_order_queue, skb);
4566 else
4567 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4568
4569 /* And clean segments covered by new one as whole. */
4570 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4571 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4572
4573 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4574 break;
4575 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4576 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4577 end_seq);
4578 break;
4579 }
4580 __skb_unlink(skb1, &tp->out_of_order_queue);
4581 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4582 TCP_SKB_CB(skb1)->end_seq);
4583 __kfree_skb(skb1);
4584 }
4585
4586add_sack:
4587 if (tcp_is_sack(tp))
4588 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4589 }
4705} 4590}
4706 4591
4707static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 4592static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4880,10 +4765,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4880 * Purge the out-of-order queue. 4765 * Purge the out-of-order queue.
4881 * Return true if queue was pruned. 4766 * Return true if queue was pruned.
4882 */ 4767 */
4883static bool tcp_prune_ofo_queue(struct sock *sk) 4768static int tcp_prune_ofo_queue(struct sock *sk)
4884{ 4769{
4885 struct tcp_sock *tp = tcp_sk(sk); 4770 struct tcp_sock *tp = tcp_sk(sk);
4886 bool res = false; 4771 int res = 0;
4887 4772
4888 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4773 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); 4774 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4897,7 +4782,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
4897 if (tp->rx_opt.sack_ok) 4782 if (tp->rx_opt.sack_ok)
4898 tcp_sack_reset(&tp->rx_opt); 4783 tcp_sack_reset(&tp->rx_opt);
4899 sk_mem_reclaim(sk); 4784 sk_mem_reclaim(sk);
4900 res = true; 4785 res = 1;
4901 } 4786 }
4902 return res; 4787 return res;
4903} 4788}
@@ -4919,7 +4804,7 @@ static int tcp_prune_queue(struct sock *sk)
4919 4804
4920 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 4805 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4921 tcp_clamp_window(sk); 4806 tcp_clamp_window(sk);
4922 else if (sk_under_memory_pressure(sk)) 4807 else if (tcp_memory_pressure)
4923 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 4808 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4924 4809
4925 tcp_collapse_ofo_queue(sk); 4810 tcp_collapse_ofo_queue(sk);
@@ -4974,29 +4859,29 @@ void tcp_cwnd_application_limited(struct sock *sk)
4974 tp->snd_cwnd_stamp = tcp_time_stamp; 4859 tp->snd_cwnd_stamp = tcp_time_stamp;
4975} 4860}
4976 4861
4977static bool tcp_should_expand_sndbuf(const struct sock *sk) 4862static int tcp_should_expand_sndbuf(struct sock *sk)
4978{ 4863{
4979 const struct tcp_sock *tp = tcp_sk(sk); 4864 struct tcp_sock *tp = tcp_sk(sk);
4980 4865
4981 /* If the user specified a specific send buffer setting, do 4866 /* If the user specified a specific send buffer setting, do
4982 * not modify it. 4867 * not modify it.
4983 */ 4868 */
4984 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) 4869 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4985 return false; 4870 return 0;
4986 4871
4987 /* If we are under global TCP memory pressure, do not expand. */ 4872 /* If we are under global TCP memory pressure, do not expand. */
4988 if (sk_under_memory_pressure(sk)) 4873 if (tcp_memory_pressure)
4989 return false; 4874 return 0;
4990 4875
4991 /* If we are under soft global TCP memory pressure, do not expand. */ 4876 /* If we are under soft global TCP memory pressure, do not expand. */
4992 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) 4877 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4993 return false; 4878 return 0;
4994 4879
4995 /* If we filled the congestion window, do not expand. */ 4880 /* If we filled the congestion window, do not expand. */
4996 if (tp->packets_out >= tp->snd_cwnd) 4881 if (tp->packets_out >= tp->snd_cwnd)
4997 return false; 4882 return 0;
4998 4883
4999 return true; 4884 return 1;
5000} 4885}
5001 4886
5002/* When incoming ACK allowed to free some skb from write_queue, 4887/* When incoming ACK allowed to free some skb from write_queue,
@@ -5010,10 +4895,8 @@ static void tcp_new_space(struct sock *sk)
5010 struct tcp_sock *tp = tcp_sk(sk); 4895 struct tcp_sock *tp = tcp_sk(sk);
5011 4896
5012 if (tcp_should_expand_sndbuf(sk)) { 4897 if (tcp_should_expand_sndbuf(sk)) {
5013 int sndmem = SKB_TRUESIZE(max_t(u32, 4898 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
5014 tp->rx_opt.mss_clamp, 4899 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
5015 tp->mss_cache) +
5016 MAX_TCP_HEADER);
5017 int demanded = max_t(unsigned int, tp->snd_cwnd, 4900 int demanded = max_t(unsigned int, tp->snd_cwnd,
5018 tp->reordering + 1); 4901 tp->reordering + 1);
5019 sndmem *= 2 * demanded; 4902 sndmem *= 2 * demanded;
@@ -5085,7 +4968,7 @@ static inline void tcp_ack_snd_check(struct sock *sk)
5085 * either form (or just set the sysctl tcp_stdurg). 4968 * either form (or just set the sysctl tcp_stdurg).
5086 */ 4969 */
5087 4970
5088static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) 4971static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
5089{ 4972{
5090 struct tcp_sock *tp = tcp_sk(sk); 4973 struct tcp_sock *tp = tcp_sk(sk);
5091 u32 ptr = ntohs(th->urg_ptr); 4974 u32 ptr = ntohs(th->urg_ptr);
@@ -5151,7 +5034,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5151} 5034}
5152 5035
5153/* This is the 'fast' part of urgent handling. */ 5036/* This is the 'fast' part of urgent handling. */
5154static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) 5037static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
5155{ 5038{
5156 struct tcp_sock *tp = tcp_sk(sk); 5039 struct tcp_sock *tp = tcp_sk(sk);
5157 5040
@@ -5214,7 +5097,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5214 return result; 5097 return result;
5215} 5098}
5216 5099
5217static inline bool tcp_checksum_complete_user(struct sock *sk, 5100static inline int tcp_checksum_complete_user(struct sock *sk,
5218 struct sk_buff *skb) 5101 struct sk_buff *skb)
5219{ 5102{
5220 return !skb_csum_unnecessary(skb) && 5103 return !skb_csum_unnecessary(skb) &&
@@ -5222,19 +5105,19 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
5222} 5105}
5223 5106
5224#ifdef CONFIG_NET_DMA 5107#ifdef CONFIG_NET_DMA
5225static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, 5108static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5226 int hlen) 5109 int hlen)
5227{ 5110{
5228 struct tcp_sock *tp = tcp_sk(sk); 5111 struct tcp_sock *tp = tcp_sk(sk);
5229 int chunk = skb->len - hlen; 5112 int chunk = skb->len - hlen;
5230 int dma_cookie; 5113 int dma_cookie;
5231 bool copied_early = false; 5114 int copied_early = 0;
5232 5115
5233 if (tp->ucopy.wakeup) 5116 if (tp->ucopy.wakeup)
5234 return false; 5117 return 0;
5235 5118
5236 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 5119 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
5237 tp->ucopy.dma_chan = net_dma_find_channel(); 5120 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
5238 5121
5239 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { 5122 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
5240 5123
@@ -5247,7 +5130,7 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5247 goto out; 5130 goto out;
5248 5131
5249 tp->ucopy.dma_cookie = dma_cookie; 5132 tp->ucopy.dma_cookie = dma_cookie;
5250 copied_early = true; 5133 copied_early = 1;
5251 5134
5252 tp->ucopy.len -= chunk; 5135 tp->ucopy.len -= chunk;
5253 tp->copied_seq += chunk; 5136 tp->copied_seq += chunk;
@@ -5271,10 +5154,10 @@ out:
5271/* Does PAWS and seqno based validation of an incoming segment, flags will 5154/* Does PAWS and seqno based validation of an incoming segment, flags will
5272 * play significant role here. 5155 * play significant role here.
5273 */ 5156 */
5274static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5157static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5275 const struct tcphdr *th, int syn_inerr) 5158 struct tcphdr *th, int syn_inerr)
5276{ 5159{
5277 const u8 *hash_location; 5160 u8 *hash_location;
5278 struct tcp_sock *tp = tcp_sk(sk); 5161 struct tcp_sock *tp = tcp_sk(sk);
5279 5162
5280 /* RFC1323: H1. Apply PAWS check first. */ 5163 /* RFC1323: H1. Apply PAWS check first. */
@@ -5297,48 +5180,38 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5297 * an acknowledgment should be sent in reply (unless the RST 5180 * an acknowledgment should be sent in reply (unless the RST
5298 * bit is set, if so drop the segment and return)". 5181 * bit is set, if so drop the segment and return)".
5299 */ 5182 */
5300 if (!th->rst) { 5183 if (!th->rst)
5301 if (th->syn)
5302 goto syn_challenge;
5303 tcp_send_dupack(sk, skb); 5184 tcp_send_dupack(sk, skb);
5304 }
5305 goto discard; 5185 goto discard;
5306 } 5186 }
5307 5187
5308 /* Step 2: check RST bit */ 5188 /* Step 2: check RST bit */
5309 if (th->rst) { 5189 if (th->rst) {
5310 /* RFC 5961 3.2 : 5190 tcp_reset(sk);
5311 * If sequence number exactly matches RCV.NXT, then
5312 * RESET the connection
5313 * else
5314 * Send a challenge ACK
5315 */
5316 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5317 tcp_reset(sk);
5318 else
5319 tcp_send_challenge_ack(sk);
5320 goto discard; 5191 goto discard;
5321 } 5192 }
5322 5193
5194 /* ts_recent update must be made after we are sure that the packet
5195 * is in window.
5196 */
5197 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5198
5323 /* step 3: check security and precedence [ignored] */ 5199 /* step 3: check security and precedence [ignored] */
5324 5200
5325 /* step 4: Check for a SYN 5201 /* step 4: Check for a SYN in window. */
5326 * RFC 5691 4.2 : Send a challenge ack 5202 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
5327 */
5328 if (th->syn) {
5329syn_challenge:
5330 if (syn_inerr) 5203 if (syn_inerr)
5331 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5204 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5332 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); 5205 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
5333 tcp_send_challenge_ack(sk); 5206 tcp_reset(sk);
5334 goto discard; 5207 return -1;
5335 } 5208 }
5336 5209
5337 return true; 5210 return 1;
5338 5211
5339discard: 5212discard:
5340 __kfree_skb(skb); 5213 __kfree_skb(skb);
5341 return false; 5214 return 0;
5342} 5215}
5343 5216
5344/* 5217/*
@@ -5365,12 +5238,11 @@ discard:
5365 * tcp_data_queue when everything is OK. 5238 * tcp_data_queue when everything is OK.
5366 */ 5239 */
5367int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5240int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5368 const struct tcphdr *th, unsigned int len) 5241 struct tcphdr *th, unsigned len)
5369{ 5242{
5370 struct tcp_sock *tp = tcp_sk(sk); 5243 struct tcp_sock *tp = tcp_sk(sk);
5244 int res;
5371 5245
5372 if (unlikely(sk->sk_rx_dst == NULL))
5373 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5374 /* 5246 /*
5375 * Header prediction. 5247 * Header prediction.
5376 * The code loosely follows the one in the famous 5248 * The code loosely follows the one in the famous
@@ -5450,14 +5322,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5450 } else { 5322 } else {
5451 int eaten = 0; 5323 int eaten = 0;
5452 int copied_early = 0; 5324 int copied_early = 0;
5453 bool fragstolen = false;
5454 5325
5455 if (tp->copied_seq == tp->rcv_nxt && 5326 if (tp->copied_seq == tp->rcv_nxt &&
5456 len - tcp_header_len <= tp->ucopy.len) { 5327 len - tcp_header_len <= tp->ucopy.len) {
5457#ifdef CONFIG_NET_DMA 5328#ifdef CONFIG_NET_DMA
5458 if (tp->ucopy.task == current && 5329 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5459 sock_owned_by_user(sk) &&
5460 tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5461 copied_early = 1; 5330 copied_early = 1;
5462 eaten = 1; 5331 eaten = 1;
5463 } 5332 }
@@ -5510,8 +5379,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5510 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); 5379 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
5511 5380
5512 /* Bulk data transfer: receiver */ 5381 /* Bulk data transfer: receiver */
5513 eaten = tcp_queue_rcv(sk, skb, tcp_header_len, 5382 __skb_pull(skb, tcp_header_len);
5514 &fragstolen); 5383 __skb_queue_tail(&sk->sk_receive_queue, skb);
5384 skb_set_owner_r(skb, sk);
5385 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5515 } 5386 }
5516 5387
5517 tcp_event_data_recv(sk, skb); 5388 tcp_event_data_recv(sk, skb);
@@ -5533,8 +5404,9 @@ no_ack:
5533 else 5404 else
5534#endif 5405#endif
5535 if (eaten) 5406 if (eaten)
5536 kfree_skb_partial(skb, fragstolen); 5407 __kfree_skb(skb);
5537 sk->sk_data_ready(sk, 0); 5408 else
5409 sk->sk_data_ready(sk, 0);
5538 return 0; 5410 return 0;
5539 } 5411 }
5540 } 5412 }
@@ -5543,25 +5415,18 @@ slow_path:
5543 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) 5415 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5544 goto csum_error; 5416 goto csum_error;
5545 5417
5546 if (!th->ack && !th->rst)
5547 goto discard;
5548
5549 /* 5418 /*
5550 * Standard slow path. 5419 * Standard slow path.
5551 */ 5420 */
5552 5421
5553 if (!tcp_validate_incoming(sk, skb, th, 1)) 5422 res = tcp_validate_incoming(sk, skb, th, 1);
5554 return 0; 5423 if (res <= 0)
5424 return -res;
5555 5425
5556step5: 5426step5:
5557 if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) 5427 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
5558 goto discard; 5428 goto discard;
5559 5429
5560 /* ts_recent update must be made after we are sure that the packet
5561 * is in window.
5562 */
5563 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5564
5565 tcp_rcv_rtt_measure_ts(sk, skb); 5430 tcp_rcv_rtt_measure_ts(sk, skb);
5566 5431
5567 /* Process urgent data. */ 5432 /* Process urgent data. */
@@ -5583,101 +5448,16 @@ discard:
5583} 5448}
5584EXPORT_SYMBOL(tcp_rcv_established); 5449EXPORT_SYMBOL(tcp_rcv_established);
5585 5450
5586void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5587{
5588 struct tcp_sock *tp = tcp_sk(sk);
5589 struct inet_connection_sock *icsk = inet_csk(sk);
5590
5591 tcp_set_state(sk, TCP_ESTABLISHED);
5592
5593 if (skb != NULL) {
5594 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5595 security_inet_conn_established(sk, skb);
5596 }
5597
5598 /* Make sure socket is routed, for correct metrics. */
5599 icsk->icsk_af_ops->rebuild_header(sk);
5600
5601 tcp_init_metrics(sk);
5602
5603 tcp_init_congestion_control(sk);
5604
5605 /* Prevent spurious tcp_cwnd_restart() on first data
5606 * packet.
5607 */
5608 tp->lsndtime = tcp_time_stamp;
5609
5610 tcp_init_buffer_space(sk);
5611
5612 if (sock_flag(sk, SOCK_KEEPOPEN))
5613 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5614
5615 if (!tp->rx_opt.snd_wscale)
5616 __tcp_fast_path_on(tp, tp->snd_wnd);
5617 else
5618 tp->pred_flags = 0;
5619
5620 if (!sock_flag(sk, SOCK_DEAD)) {
5621 sk->sk_state_change(sk);
5622 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5623 }
5624}
5625
5626static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5627 struct tcp_fastopen_cookie *cookie)
5628{
5629 struct tcp_sock *tp = tcp_sk(sk);
5630 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5631 u16 mss = tp->rx_opt.mss_clamp;
5632 bool syn_drop;
5633
5634 if (mss == tp->rx_opt.user_mss) {
5635 struct tcp_options_received opt;
5636 const u8 *hash_location;
5637
5638 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5639 tcp_clear_options(&opt);
5640 opt.user_mss = opt.mss_clamp = 0;
5641 tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
5642 mss = opt.mss_clamp;
5643 }
5644
5645 if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
5646 cookie->len = -1;
5647
5648 /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
5649 * the remote receives only the retransmitted (regular) SYNs: either
5650 * the original SYN-data or the corresponding SYN-ACK is lost.
5651 */
5652 syn_drop = (cookie->len <= 0 && data &&
5653 inet_csk(sk)->icsk_retransmits);
5654
5655 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
5656
5657 if (data) { /* Retransmit unacked data in SYN */
5658 tcp_for_write_queue_from(data, sk) {
5659 if (data == tcp_send_head(sk) ||
5660 __tcp_retransmit_skb(sk, data))
5661 break;
5662 }
5663 tcp_rearm_rto(sk);
5664 return true;
5665 }
5666 tp->syn_data_acked = tp->syn_data;
5667 return false;
5668}
5669
5670static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5451static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5671 const struct tcphdr *th, unsigned int len) 5452 struct tcphdr *th, unsigned len)
5672{ 5453{
5673 const u8 *hash_location; 5454 u8 *hash_location;
5674 struct inet_connection_sock *icsk = inet_csk(sk); 5455 struct inet_connection_sock *icsk = inet_csk(sk);
5675 struct tcp_sock *tp = tcp_sk(sk); 5456 struct tcp_sock *tp = tcp_sk(sk);
5676 struct tcp_cookie_values *cvp = tp->cookie_values; 5457 struct tcp_cookie_values *cvp = tp->cookie_values;
5677 struct tcp_fastopen_cookie foc = { .len = -1 };
5678 int saved_clamp = tp->rx_opt.mss_clamp; 5458 int saved_clamp = tp->rx_opt.mss_clamp;
5679 5459
5680 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); 5460 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
5681 5461
5682 if (th->ack) { 5462 if (th->ack) {
5683 /* rfc793: 5463 /* rfc793:
@@ -5687,9 +5467,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5687 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5467 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
5688 * a reset (unless the RST bit is set, if so drop 5468 * a reset (unless the RST bit is set, if so drop
5689 * the segment and return)" 5469 * the segment and return)"
5470 *
5471 * We do not send data with SYN, so that RFC-correct
5472 * test reduces to:
5690 */ 5473 */
5691 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || 5474 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
5692 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5693 goto reset_and_undo; 5475 goto reset_and_undo;
5694 5476
5695 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5477 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5731,7 +5513,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5731 5513
5732 TCP_ECN_rcv_synack(tp, th); 5514 TCP_ECN_rcv_synack(tp, th);
5733 5515
5734 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5516 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
5735 tcp_ack(sk, skb, FLAG_SLOWPATH); 5517 tcp_ack(sk, skb, FLAG_SLOWPATH);
5736 5518
5737 /* Ok.. it's good. Set up sequence numbers and 5519 /* Ok.. it's good. Set up sequence numbers and
@@ -5744,6 +5526,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5744 * never scaled. 5526 * never scaled.
5745 */ 5527 */
5746 tp->snd_wnd = ntohs(th->window); 5528 tp->snd_wnd = ntohs(th->window);
5529 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5747 5530
5748 if (!tp->rx_opt.wscale_ok) { 5531 if (!tp->rx_opt.wscale_ok) {
5749 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5532 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5797,12 +5580,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5797 } 5580 }
5798 5581
5799 smp_mb(); 5582 smp_mb();
5583 tcp_set_state(sk, TCP_ESTABLISHED);
5800 5584
5801 tcp_finish_connect(sk, skb); 5585 security_inet_conn_established(sk, skb);
5802 5586
5803 if ((tp->syn_fastopen || tp->syn_data) && 5587 /* Make sure socket is routed, for correct metrics. */
5804 tcp_rcv_fastopen_synack(sk, skb, &foc)) 5588 icsk->icsk_af_ops->rebuild_header(sk);
5805 return -1; 5589
5590 tcp_init_metrics(sk);
5591
5592 tcp_init_congestion_control(sk);
5593
5594 /* Prevent spurious tcp_cwnd_restart() on first data
5595 * packet.
5596 */
5597 tp->lsndtime = tcp_time_stamp;
5598
5599 tcp_init_buffer_space(sk);
5600
5601 if (sock_flag(sk, SOCK_KEEPOPEN))
5602 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5603
5604 if (!tp->rx_opt.snd_wscale)
5605 __tcp_fast_path_on(tp, tp->snd_wnd);
5606 else
5607 tp->pred_flags = 0;
5608
5609 if (!sock_flag(sk, SOCK_DEAD)) {
5610 sk->sk_state_change(sk);
5611 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5612 }
5806 5613
5807 if (sk->sk_write_pending || 5614 if (sk->sk_write_pending ||
5808 icsk->icsk_accept_queue.rskq_defer_accept || 5615 icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5816,6 +5623,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5816 */ 5623 */
5817 inet_csk_schedule_ack(sk); 5624 inet_csk_schedule_ack(sk);
5818 icsk->icsk_ack.lrcvtime = tcp_time_stamp; 5625 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5626 icsk->icsk_ack.ato = TCP_ATO_MIN;
5627 tcp_incr_quickack(sk);
5819 tcp_enter_quickack_mode(sk); 5628 tcp_enter_quickack_mode(sk);
5820 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5629 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5821 TCP_DELACK_MAX, TCP_RTO_MAX); 5630 TCP_DELACK_MAX, TCP_RTO_MAX);
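Both sides of this hunk end the same way: if the application already has data queued (or defer-accept is in effect), the final ACK of the handshake is delayed so it can ride on the first data segment, and the socket enters quick-ack mode with a capped delayed-ACK timer. A rough user-space model of that choice follows; the names and constants are illustrative, not the kernel's.

#include <stdbool.h>

enum ack_plan { ACK_NOW, ACK_DELAYED };

struct handshake_done {
    bool write_pending;   /* application data already queued to send */
    bool defer_accept;    /* listener asked to defer the ACK         */
    int  delack_max_ms;   /* upper bound on the delayed ACK          */
};

enum ack_plan plan_first_ack(const struct handshake_done *h, int *timeout_ms)
{
    if (h->write_pending || h->defer_accept) {
        *timeout_ms = h->delack_max_ms; /* piggyback on the first data segment */
        return ACK_DELAYED;
    }
    *timeout_ms = 0;
    return ACK_NOW;
}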
@@ -5881,9 +5690,7 @@ discard:
5881 tcp_send_synack(sk); 5690 tcp_send_synack(sk);
5882#if 0 5691#if 0
5883 /* Note, we could accept data and URG from this segment. 5692 /* Note, we could accept data and URG from this segment.
5884 * There are no obstacles to make this (except that we must 5693 * There are no obstacles to make this.
5885 * either change tcp_recvmsg() to prevent it from returning data
5886 * before 3WHS completes per RFC793, or employ TCP Fast Open).
5887 * 5694 *
5888 * However, if we ignore data in ACKless segments sometimes, 5695 * However, if we ignore data in ACKless segments sometimes,
5889 * we have no reasons to accept it sometimes. 5696 * we have no reasons to accept it sometimes.
@@ -5919,12 +5726,12 @@ reset_and_undo:
5919 */ 5726 */
5920 5727
5921int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5728int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5922 const struct tcphdr *th, unsigned int len) 5729 struct tcphdr *th, unsigned len)
5923{ 5730{
5924 struct tcp_sock *tp = tcp_sk(sk); 5731 struct tcp_sock *tp = tcp_sk(sk);
5925 struct inet_connection_sock *icsk = inet_csk(sk); 5732 struct inet_connection_sock *icsk = inet_csk(sk);
5926 struct request_sock *req;
5927 int queued = 0; 5733 int queued = 0;
5734 int res;
5928 5735
5929 tp->rx_opt.saw_tstamp = 0; 5736 tp->rx_opt.saw_tstamp = 0;
5930 5737
@@ -5940,8 +5747,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5940 goto discard; 5747 goto discard;
5941 5748
5942 if (th->syn) { 5749 if (th->syn) {
5943 if (th->fin)
5944 goto discard;
5945 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5750 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5946 return 1; 5751 return 1;
5947 5752
@@ -5979,47 +5784,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5979 return 0; 5784 return 0;
5980 } 5785 }
5981 5786
5982 req = tp->fastopen_rsk; 5787 res = tcp_validate_incoming(sk, skb, th, 0);
5983 if (req != NULL) { 5788 if (res <= 0)
5984 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 5789 return -res;
5985 sk->sk_state != TCP_FIN_WAIT1);
5986
5987 if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5988 goto discard;
5989 }
5990
5991 if (!th->ack && !th->rst)
5992 goto discard;
5993
5994 if (!tcp_validate_incoming(sk, skb, th, 0))
5995 return 0;
5996 5790
5997 /* step 5: check the ACK field */ 5791 /* step 5: check the ACK field */
5998 if (true) { 5792 if (th->ack) {
5999 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5793 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
6000 5794
6001 switch (sk->sk_state) { 5795 switch (sk->sk_state) {
6002 case TCP_SYN_RECV: 5796 case TCP_SYN_RECV:
6003 if (acceptable) { 5797 if (acceptable) {
6004 /* Once we leave TCP_SYN_RECV, we no longer 5798 tp->copied_seq = tp->rcv_nxt;
6005 * need req so release it.
6006 */
6007 if (req) {
6008 tcp_synack_rtt_meas(sk, req);
6009 tp->total_retrans = req->num_retrans;
6010
6011 reqsk_fastopen_remove(sk, req, false);
6012 } else {
6013 /* Make sure socket is routed, for
6014 * correct metrics.
6015 */
6016 icsk->icsk_af_ops->rebuild_header(sk);
6017 tcp_init_congestion_control(sk);
6018
6019 tcp_mtup_init(sk);
6020 tcp_init_buffer_space(sk);
6021 tp->copied_seq = tp->rcv_nxt;
6022 }
6023 smp_mb(); 5799 smp_mb();
6024 tcp_set_state(sk, TCP_ESTABLISHED); 5800 tcp_set_state(sk, TCP_ESTABLISHED);
6025 sk->sk_state_change(sk); 5801 sk->sk_state_change(sk);
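As a paraphrase of the step-5 logic above (again a sketch, not the kernel): an ACK arriving in SYN-RECV that passes the acceptability test moves the connection to ESTABLISHED, while an unacceptable one makes the function return 1, which the caller turns into a reset.

enum tcp_state { SYN_RECV, ESTABLISHED };

/* Returns 0 on success, 1 to ask the caller to send a reset. */
int handle_ack_in_syn_recv(enum tcp_state *state, int ack_acceptable)
{
    if (*state != SYN_RECV)
        return 0;
    if (!ack_acceptable)
        return 1;            /* caller resets the connection */
    *state = ESTABLISHED;    /* handshake complete           */
    return 0;
}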
@@ -6041,27 +5817,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6041 if (tp->rx_opt.tstamp_ok) 5817 if (tp->rx_opt.tstamp_ok)
6042 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5818 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6043 5819
6044 if (req) { 5820 /* Make sure socket is routed, for
6045 /* Re-arm the timer because data may 5821 * correct metrics.
6046 * have been sent out. This is similar 5822 */
6047 * to the regular data transmission case 5823 icsk->icsk_af_ops->rebuild_header(sk);
6048 * when new data has just been ack'ed. 5824
6049 * 5825 tcp_init_metrics(sk);
6050 * (TFO) - we could try to be more 5826
6051 * aggressive and retranmitting any data 5827 tcp_init_congestion_control(sk);
6052 * sooner based on when they were sent
6053 * out.
6054 */
6055 tcp_rearm_rto(sk);
6056 } else
6057 tcp_init_metrics(sk);
6058 5828
6059 /* Prevent spurious tcp_cwnd_restart() on 5829 /* Prevent spurious tcp_cwnd_restart() on
6060 * first data packet. 5830 * first data packet.
6061 */ 5831 */
6062 tp->lsndtime = tcp_time_stamp; 5832 tp->lsndtime = tcp_time_stamp;
6063 5833
5834 tcp_mtup_init(sk);
6064 tcp_initialize_rcv_mss(sk); 5835 tcp_initialize_rcv_mss(sk);
5836 tcp_init_buffer_space(sk);
6065 tcp_fast_path_on(tp); 5837 tcp_fast_path_on(tp);
6066 } else { 5838 } else {
6067 return 1; 5839 return 1;
@@ -6069,33 +5841,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6069 break; 5841 break;
6070 5842
6071 case TCP_FIN_WAIT1: 5843 case TCP_FIN_WAIT1:
6072 /* If we enter the TCP_FIN_WAIT1 state and we are a
6073 * Fast Open socket and this is the first acceptable
6074 * ACK we have received, this would have acknowledged
6075 * our SYNACK so stop the SYNACK timer.
6076 */
6077 if (req != NULL) {
6078 /* Return RST if ack_seq is invalid.
6079 * Note that RFC793 only says to generate a
6080 * DUPACK for it but for TCP Fast Open it seems
6081 * better to treat this case like TCP_SYN_RECV
6082 * above.
6083 */
6084 if (!acceptable)
6085 return 1;
6086 /* We no longer need the request sock. */
6087 reqsk_fastopen_remove(sk, req, false);
6088 tcp_rearm_rto(sk);
6089 }
6090 if (tp->snd_una == tp->write_seq) { 5844 if (tp->snd_una == tp->write_seq) {
6091 struct dst_entry *dst;
6092
6093 tcp_set_state(sk, TCP_FIN_WAIT2); 5845 tcp_set_state(sk, TCP_FIN_WAIT2);
6094 sk->sk_shutdown |= SEND_SHUTDOWN; 5846 sk->sk_shutdown |= SEND_SHUTDOWN;
6095 5847 dst_confirm(__sk_dst_get(sk));
6096 dst = __sk_dst_get(sk);
6097 if (dst)
6098 dst_confirm(dst);
6099 5848
6100 if (!sock_flag(sk, SOCK_DEAD)) 5849 if (!sock_flag(sk, SOCK_DEAD))
6101 /* Wake up lingering close() */ 5850 /* Wake up lingering close() */
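In the FIN_WAIT1 branch above, the transition to FIN_WAIT2 happens only once everything we sent, including our FIN, has been acknowledged, i.e. SND.UNA has caught up with the write sequence. A compact illustration with hypothetical types:

#include <stdbool.h>
#include <stdint.h>

enum fin_state { FIN_WAIT1, FIN_WAIT2 };

bool maybe_enter_fin_wait2(enum fin_state *st, uint32_t snd_una, uint32_t write_seq)
{
    if (*st == FIN_WAIT1 && snd_una == write_seq) {
        *st = FIN_WAIT2;   /* our FIN is acked; now wait for the peer's FIN */
        return true;
    }
    return false;
}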
@@ -6145,12 +5894,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6145 } 5894 }
6146 break; 5895 break;
6147 } 5896 }
6148 } 5897 } else
6149 5898 goto discard;
6150 /* ts_recent update must be made after we are sure that the packet
6151 * is in window.
6152 */
6153 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
6154 5899
6155 /* step 6: check the URG bit */ 5900 /* step 6: check the URG bit */
6156 tcp_urg(sk, skb, th); 5901 tcp_urg(sk, skb, th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 54139fa514e..6cdf6a28f6b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -50,7 +50,6 @@
50 * a single port at the same time. 50 * a single port at the same time.
51 */ 51 */
52 52
53#define pr_fmt(fmt) "TCP: " fmt
54 53
55#include <linux/bottom_half.h> 54#include <linux/bottom_half.h>
56#include <linux/types.h> 55#include <linux/types.h>
@@ -74,7 +73,6 @@
74#include <net/xfrm.h> 73#include <net/xfrm.h>
75#include <net/netdma.h> 74#include <net/netdma.h>
76#include <net/secure_seq.h> 75#include <net/secure_seq.h>
77#include <net/tcp_memcontrol.h>
78 76
79#include <linux/inet.h> 77#include <linux/inet.h>
80#include <linux/ipv6.h> 78#include <linux/ipv6.h>
@@ -91,14 +89,22 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 89
92 90
93#ifdef CONFIG_TCP_MD5SIG 91#ifdef CONFIG_TCP_MD5SIG
94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 92static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th); 93 __be32 addr);
94static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, struct tcphdr *th);
96#else
97static inline
98struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99{
100 return NULL;
101}
96#endif 102#endif
97 103
98struct inet_hashinfo tcp_hashinfo; 104struct inet_hashinfo tcp_hashinfo;
99EXPORT_SYMBOL(tcp_hashinfo); 105EXPORT_SYMBOL(tcp_hashinfo);
100 106
101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 107static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
102{ 108{
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 109 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr, 110 ip_hdr(skb)->saddr,
@@ -196,13 +202,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
196 /* Reset inherited state */ 202 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0; 203 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0; 204 tp->rx_opt.ts_recent_stamp = 0;
199 if (likely(!tp->repair)) 205 tp->write_seq = 0;
200 tp->write_seq = 0;
201 } 206 }
202 207
203 if (tcp_death_row.sysctl_tw_recycle && 208 if (tcp_death_row.sysctl_tw_recycle &&
204 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 209 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
205 tcp_fetch_timewait_stamp(sk, &rt->dst); 210 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
211 /*
212 * VJ's idea. We save last timestamp seen from
213 * the destination in peer table, when entering state
214 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
215 * when trying new connection.
216 */
217 if (peer) {
218 inet_peer_refcheck(peer);
219 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
220 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
221 tp->rx_opt.ts_recent = peer->tcp_ts;
222 }
223 }
224 }
206 225
207 inet->inet_dport = usin->sin_port; 226 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr; 227 inet->inet_daddr = daddr;
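The block added on the right-hand side implements the comment's "VJ's idea": the last timestamp seen from a destination is remembered in the peer cache when a connection enters TIME-WAIT, and a new connection to the same peer seeds rx_opt.ts_recent from it if the entry is younger than TCP_PAWS_MSL. A tiny user-space model of that seeding decision (the struct and the 60-second constant are stand-ins):

#include <stdint.h>
#include <time.h>

#define PAWS_MSL_SECONDS 60 /* stand-in for TCP_PAWS_MSL */

struct peer_ts_cache {
    uint32_t ts_val;    /* last timestamp value seen from this peer */
    time_t   stamp;     /* when it was recorded (entering TIME-WAIT) */
};

/* Seed ts_recent from the cache if the entry is fresh enough. */
void seed_ts_recent(const struct peer_ts_cache *peer, time_t now,
                    uint32_t *ts_recent, time_t *ts_recent_stamp)
{
    if (peer && now - peer->stamp <= PAWS_MSL_SECONDS) {
        *ts_recent       = peer->ts_val;
        *ts_recent_stamp = peer->stamp;
    }
}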
@@ -234,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
234 sk->sk_gso_type = SKB_GSO_TCPV4; 253 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst); 254 sk_setup_caps(sk, &rt->dst);
236 255
237 if (!tp->write_seq && likely(!tp->repair)) 256 if (!tp->write_seq)
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 257 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr, 258 inet->inet_daddr,
240 inet->inet_sport, 259 inet->inet_sport,
@@ -243,7 +262,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
243 inet->inet_id = tp->write_seq ^ jiffies; 262 inet->inet_id = tp->write_seq ^ jiffies;
244 263
245 err = tcp_connect(sk); 264 err = tcp_connect(sk);
246
247 rt = NULL; 265 rt = NULL;
248 if (err) 266 if (err)
249 goto failure; 267 goto failure;
@@ -264,15 +282,12 @@ failure:
264EXPORT_SYMBOL(tcp_v4_connect); 282EXPORT_SYMBOL(tcp_v4_connect);
265 283
266/* 284/*
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 285 * This routine does path mtu discovery as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
270 */ 286 */
271static void tcp_v4_mtu_reduced(struct sock *sk) 287static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
272{ 288{
273 struct dst_entry *dst; 289 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk); 290 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276 291
277 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 292 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
278 * send out by Linux are always <576bytes so they should go through 293 * send out by Linux are always <576bytes so they should go through
@@ -281,10 +296,17 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
281 if (sk->sk_state == TCP_LISTEN) 296 if (sk->sk_state == TCP_LISTEN)
282 return; 297 return;
283 298
284 dst = inet_csk_update_pmtu(sk, mtu); 299 /* We don't check in the destentry if pmtu discovery is forbidden
285 if (!dst) 300 * on this route. We just assume that no packet-too-big packets
301 * are sent back when pmtu discovery is not active.
302 * There is a small race when the user changes this flag in the
303 * route, but I think that's acceptable.
304 */
305 if ((dst = __sk_dst_check(sk, 0)) == NULL)
286 return; 306 return;
287 307
308 dst->ops->update_pmtu(dst, mtu);
309
288 /* Something is about to be wrong... Remember soft error 310 /* Something is about to be wrong... Remember soft error
289 * for the case, if this connection will not able to recover. 311 * for the case, if this connection will not able to recover.
290 */ 312 */
@@ -306,14 +328,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
306 } /* else let the usual retransmit timer handle it */ 328 } /* else let the usual retransmit timer handle it */
307} 329}
308 330
309static void do_redirect(struct sk_buff *skb, struct sock *sk)
310{
311 struct dst_entry *dst = __sk_dst_check(sk, 0);
312
313 if (dst)
314 dst->ops->redirect(dst, sk, skb);
315}
316
317/* 331/*
318 * This routine is called by the ICMP module when it gets some 332 * This routine is called by the ICMP module when it gets some
319 * sort of error condition. If err < 0 then the socket should 333 * sort of error condition. If err < 0 then the socket should
@@ -341,7 +355,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
341 const int code = icmp_hdr(icmp_skb)->code; 355 const int code = icmp_hdr(icmp_skb)->code;
342 struct sock *sk; 356 struct sock *sk;
343 struct sk_buff *skb; 357 struct sk_buff *skb;
344 struct request_sock *req;
345 __u32 seq; 358 __u32 seq;
346 __u32 remaining; 359 __u32 remaining;
347 int err; 360 int err;
@@ -366,12 +379,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
366 bh_lock_sock(sk); 379 bh_lock_sock(sk);
367 /* If too many ICMPs get dropped on busy 380 /* If too many ICMPs get dropped on busy
368 * servers this needs to be solved differently. 381 * servers this needs to be solved differently.
369 * We do take care of PMTU discovery (RFC1191) special case :
370 * we can receive locally generated ICMP messages while socket is held.
371 */ 382 */
372 if (sock_owned_by_user(sk) && 383 if (sock_owned_by_user(sk))
373 type != ICMP_DEST_UNREACH &&
374 code != ICMP_FRAG_NEEDED)
375 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); 384 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
376 385
377 if (sk->sk_state == TCP_CLOSE) 386 if (sk->sk_state == TCP_CLOSE)
@@ -384,20 +393,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384 393
385 icsk = inet_csk(sk); 394 icsk = inet_csk(sk);
386 tp = tcp_sk(sk); 395 tp = tcp_sk(sk);
387 req = tp->fastopen_rsk;
388 seq = ntohl(th->seq); 396 seq = ntohl(th->seq);
389 if (sk->sk_state != TCP_LISTEN && 397 if (sk->sk_state != TCP_LISTEN &&
390 !between(seq, tp->snd_una, tp->snd_nxt) && 398 !between(seq, tp->snd_una, tp->snd_nxt)) {
391 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
392 /* For a Fast Open socket, allow seq to be snt_isn. */
393 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 399 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 goto out; 400 goto out;
395 } 401 }
396 402
397 switch (type) { 403 switch (type) {
398 case ICMP_REDIRECT:
399 do_redirect(icmp_skb, sk);
400 goto out;
401 case ICMP_SOURCE_QUENCH: 404 case ICMP_SOURCE_QUENCH:
402 /* Just silently ignore these. */ 405 /* Just silently ignore these. */
403 goto out; 406 goto out;
@@ -409,13 +412,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
409 goto out; 412 goto out;
410 413
411 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 414 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
412 tp->mtu_info = info; 415 if (!sock_owned_by_user(sk))
413 if (!sock_owned_by_user(sk)) { 416 do_pmtu_discovery(sk, iph, info);
414 tcp_v4_mtu_reduced(sk);
415 } else {
416 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
417 sock_hold(sk);
418 }
419 goto out; 417 goto out;
420 } 418 }
421 419
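The left-hand (newer) code in this hunk shows the pattern that replaced do_pmtu_discovery(): record the ICMP-reported MTU in tp->mtu_info and, if the socket is currently owned by the user, only set a "MTU reduced" deferred flag so the work runs when the lock is released. A generic sketch of that defer-to-owner pattern, with made-up names:

#include <stdbool.h>
#include <stdint.h>

struct deferred_sock {
    bool     owned_by_user;   /* user context holds the socket lock */
    uint32_t pending_mtu;     /* last MTU reported by ICMP           */
    unsigned deferred_flags;  /* work to run when the lock is dropped */
};

#define DEFER_MTU_REDUCED 0x1u

void apply_mtu_now(struct deferred_sock *sk) { (void)sk; /* shrink mss, retransmit */ }

void on_icmp_frag_needed(struct deferred_sock *sk, uint32_t mtu)
{
    sk->pending_mtu = mtu;
    if (!sk->owned_by_user)
        apply_mtu_now(sk);                        /* safe: we own the socket */
    else
        sk->deferred_flags |= DEFER_MTU_REDUCED;  /* handled on release      */
}

void on_release_lock(struct deferred_sock *sk)
{
    if (sk->deferred_flags & DEFER_MTU_REDUCED) {
        sk->deferred_flags &= ~DEFER_MTU_REDUCED;
        apply_mtu_now(sk);
    }
}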
@@ -428,8 +426,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
428 !icsk->icsk_backoff) 426 !icsk->icsk_backoff)
429 break; 427 break;
430 428
431 /* XXX (TFO) - revisit the following logic for TFO */
432
433 if (sock_owned_by_user(sk)) 429 if (sock_owned_by_user(sk))
434 break; 430 break;
435 431
@@ -461,14 +457,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
461 goto out; 457 goto out;
462 } 458 }
463 459
464 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
465 * than following the TCP_SYN_RECV case and closing the socket,
466 * we ignore the ICMP error and keep trying like a fully established
467 * socket. Is this the right thing to do?
468 */
469 if (req && req->sk == NULL)
470 goto out;
471
472 switch (sk->sk_state) { 460 switch (sk->sk_state) {
473 struct request_sock *req, **prev; 461 struct request_sock *req, **prev;
474 case TCP_LISTEN: 462 case TCP_LISTEN:
@@ -501,8 +489,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
501 489
502 case TCP_SYN_SENT: 490 case TCP_SYN_SENT:
503 case TCP_SYN_RECV: /* Cannot happen. 491 case TCP_SYN_RECV: /* Cannot happen.
504 It can f.e. if SYNs crossed, 492 It can f.e. if SYNs crossed.
505 or Fast Open.
506 */ 493 */
507 if (!sock_owned_by_user(sk)) { 494 if (!sock_owned_by_user(sk)) {
508 sk->sk_err = err; 495 sk->sk_err = err;
@@ -565,7 +552,7 @@ static void __tcp_v4_send_check(struct sk_buff *skb,
565/* This routine computes an IPv4 TCP checksum. */ 552/* This routine computes an IPv4 TCP checksum. */
566void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 553void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567{ 554{
568 const struct inet_sock *inet = inet_sk(sk); 555 struct inet_sock *inet = inet_sk(sk);
569 556
570 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 557 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571} 558}
@@ -603,7 +590,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
603 590
604static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) 591static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
605{ 592{
606 const struct tcphdr *th = tcp_hdr(skb); 593 struct tcphdr *th = tcp_hdr(skb);
607 struct { 594 struct {
608 struct tcphdr th; 595 struct tcphdr th;
609#ifdef CONFIG_TCP_MD5SIG 596#ifdef CONFIG_TCP_MD5SIG
@@ -613,10 +600,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
613 struct ip_reply_arg arg; 600 struct ip_reply_arg arg;
614#ifdef CONFIG_TCP_MD5SIG 601#ifdef CONFIG_TCP_MD5SIG
615 struct tcp_md5sig_key *key; 602 struct tcp_md5sig_key *key;
616 const __u8 *hash_location = NULL;
617 unsigned char newhash[16];
618 int genhash;
619 struct sock *sk1 = NULL;
620#endif 603#endif
621 struct net *net; 604 struct net *net;
622 605
@@ -647,36 +630,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
647 arg.iov[0].iov_len = sizeof(rep.th); 630 arg.iov[0].iov_len = sizeof(rep.th);
648 631
649#ifdef CONFIG_TCP_MD5SIG 632#ifdef CONFIG_TCP_MD5SIG
650 hash_location = tcp_parse_md5sig_option(th); 633 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
651 if (!sk && hash_location) {
652 /*
653 * active side is lost. Try to find listening socket through
654 * source port, and then find md5 key through listening socket.
655 * we are not losing security here:
656 * Incoming packet is checked with md5 hash using the found key,
657 * no RST generated if md5 hash doesn't match.
658 */
659 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
660 &tcp_hashinfo, ip_hdr(skb)->daddr,
661 ntohs(th->source), inet_iif(skb));
662 /* don't send rst if it can't find key */
663 if (!sk1)
664 return;
665 rcu_read_lock();
666 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
667 &ip_hdr(skb)->saddr, AF_INET);
668 if (!key)
669 goto release_sk1;
670
671 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
672 if (genhash || memcmp(hash_location, newhash, 16) != 0)
673 goto release_sk1;
674 } else {
675 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
676 &ip_hdr(skb)->saddr,
677 AF_INET) : NULL;
678 }
679
680 if (key) { 634 if (key) {
681 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 635 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
682 (TCPOPT_NOP << 16) | 636 (TCPOPT_NOP << 16) |
@@ -696,28 +650,13 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
696 arg.iov[0].iov_len, IPPROTO_TCP, 0); 650 arg.iov[0].iov_len, IPPROTO_TCP, 0);
697 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 651 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
698 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 652 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
699 /* When socket is gone, all binding information is lost.
700 * routing might fail in this case. No choice here, if we choose to force
701 * input interface, we will misroute in case of asymmetric route.
702 */
703 if (sk)
704 arg.bound_dev_if = sk->sk_bound_dev_if;
705 653
706 net = dev_net(skb_dst(skb)->dev); 654 net = dev_net(skb_dst(skb)->dev);
707 arg.tos = ip_hdr(skb)->tos; 655 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
708 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 656 &arg, arg.iov[0].iov_len);
709 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
710 657
711 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 658 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
712 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 659 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
713
714#ifdef CONFIG_TCP_MD5SIG
715release_sk1:
716 if (sk1) {
717 rcu_read_unlock();
718 sock_put(sk1);
719 }
720#endif
721} 660}
722 661
723/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 662/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
@@ -727,9 +666,9 @@ release_sk1:
727static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, 666static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
728 u32 win, u32 ts, int oif, 667 u32 win, u32 ts, int oif,
729 struct tcp_md5sig_key *key, 668 struct tcp_md5sig_key *key,
730 int reply_flags, u8 tos) 669 int reply_flags)
731{ 670{
732 const struct tcphdr *th = tcp_hdr(skb); 671 struct tcphdr *th = tcp_hdr(skb);
733 struct { 672 struct {
734 struct tcphdr th; 673 struct tcphdr th;
735 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 674 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
@@ -787,9 +726,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
787 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 726 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
788 if (oif) 727 if (oif)
789 arg.bound_dev_if = oif; 728 arg.bound_dev_if = oif;
790 arg.tos = tos; 729
791 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 730 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
792 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 731 &arg, arg.iov[0].iov_len);
793 732
794 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 733 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
795} 734}
@@ -804,8 +743,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
804 tcptw->tw_ts_recent, 743 tcptw->tw_ts_recent,
805 tw->tw_bound_dev_if, 744 tw->tw_bound_dev_if,
806 tcp_twsk_md5_key(tcptw), 745 tcp_twsk_md5_key(tcptw),
807 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 746 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
808 tw->tw_tos
809 ); 747 );
810 748
811 inet_twsk_put(tw); 749 inet_twsk_put(tw);
@@ -814,18 +752,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
814static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 752static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
815 struct request_sock *req) 753 struct request_sock *req)
816{ 754{
817 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 755 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
818 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 756 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
819 */
820 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
821 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
822 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
823 req->ts_recent, 757 req->ts_recent,
824 0, 758 0,
825 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 759 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
826 AF_INET), 760 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
827 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
828 ip_hdr(skb)->tos);
829} 761}
830 762
831/* 763/*
@@ -835,9 +767,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
835 */ 767 */
836static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 768static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
837 struct request_sock *req, 769 struct request_sock *req,
838 struct request_values *rvp, 770 struct request_values *rvp)
839 u16 queue_mapping,
840 bool nocache)
841{ 771{
842 const struct inet_request_sock *ireq = inet_rsk(req); 772 const struct inet_request_sock *ireq = inet_rsk(req);
843 struct flowi4 fl4; 773 struct flowi4 fl4;
@@ -848,31 +778,26 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 778 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
849 return -1; 779 return -1;
850 780
851 skb = tcp_make_synack(sk, dst, req, rvp, NULL); 781 skb = tcp_make_synack(sk, dst, req, rvp);
852 782
853 if (skb) { 783 if (skb) {
854 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 784 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
855 785
856 skb_set_queue_mapping(skb, queue_mapping);
857 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, 786 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
858 ireq->rmt_addr, 787 ireq->rmt_addr,
859 ireq->opt); 788 ireq->opt);
860 err = net_xmit_eval(err); 789 err = net_xmit_eval(err);
861 if (!tcp_rsk(req)->snt_synack && !err)
862 tcp_rsk(req)->snt_synack = tcp_time_stamp;
863 } 790 }
864 791
792 dst_release(dst);
865 return err; 793 return err;
866} 794}
867 795
868static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, 796static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
869 struct request_values *rvp) 797 struct request_values *rvp)
870{ 798{
871 int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); 799 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
872 800 return tcp_v4_send_synack(sk, NULL, req, rvp);
873 if (!res)
874 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
875 return res;
876} 801}
877 802
878/* 803/*
@@ -884,14 +809,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
884} 809}
885 810
886/* 811/*
887 * Return true if a syncookie should be sent 812 * Return 1 if a syncookie should be sent
888 */ 813 */
889bool tcp_syn_flood_action(struct sock *sk, 814int tcp_syn_flood_action(struct sock *sk,
890 const struct sk_buff *skb, 815 const struct sk_buff *skb,
891 const char *proto) 816 const char *proto)
892{ 817{
893 const char *msg = "Dropping request"; 818 const char *msg = "Dropping request";
894 bool want_cookie = false; 819 int want_cookie = 0;
895 struct listen_sock *lopt; 820 struct listen_sock *lopt;
896 821
897 822
@@ -899,7 +824,7 @@ bool tcp_syn_flood_action(struct sock *sk,
899#ifdef CONFIG_SYN_COOKIES 824#ifdef CONFIG_SYN_COOKIES
900 if (sysctl_tcp_syncookies) { 825 if (sysctl_tcp_syncookies) {
901 msg = "Sending cookies"; 826 msg = "Sending cookies";
902 want_cookie = true; 827 want_cookie = 1;
903 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); 828 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
904 } else 829 } else
905#endif 830#endif
@@ -908,7 +833,8 @@ bool tcp_syn_flood_action(struct sock *sk,
908 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; 833 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
909 if (!lopt->synflood_warned) { 834 if (!lopt->synflood_warned) {
910 lopt->synflood_warned = 1; 835 lopt->synflood_warned = 1;
911 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", 836 pr_info("%s: Possible SYN flooding on port %d. %s. "
837 " Check SNMP counters.\n",
912 proto, ntohs(tcp_hdr(skb)->dest), msg); 838 proto, ntohs(tcp_hdr(skb)->dest), msg);
913 } 839 }
914 return want_cookie; 840 return want_cookie;
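tcp_syn_flood_action() above decides, per listener, whether a SYN that overflowed the queue should be answered with a syncookie (when the syncookies sysctl is on) or dropped, and prints the "possible SYN flooding" message only once per listen socket. A condensed model with the sysctl and the warned flag passed as plain parameters:

#include <stdbool.h>
#include <stdio.h>

/* Returns true when a syncookie should be sent for this SYN. */
bool syn_flood_action(bool syncookies_enabled, int port, bool *synflood_warned)
{
    const char *msg = syncookies_enabled ? "Sending cookies" : "Dropping request";
    bool want_cookie = syncookies_enabled;

    if (!*synflood_warned) {
        *synflood_warned = true;   /* warn once per listening socket */
        printf("TCP: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
               port, msg);
    }
    return want_cookie;
}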
@@ -918,7 +844,8 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
918/* 844/*
919 * Save and compile IPv4 options into the request_sock if needed. 845 * Save and compile IPv4 options into the request_sock if needed.
920 */ 846 */
921static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) 847static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
848 struct sk_buff *skb)
922{ 849{
923 const struct ip_options *opt = &(IPCB(skb)->opt); 850 const struct ip_options *opt = &(IPCB(skb)->opt);
924 struct ip_options_rcu *dopt = NULL; 851 struct ip_options_rcu *dopt = NULL;
@@ -945,138 +872,153 @@ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
945 */ 872 */
946 873
947/* Find the Key structure for an address. */ 874/* Find the Key structure for an address. */
948struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, 875static struct tcp_md5sig_key *
949 const union tcp_md5_addr *addr, 876 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
950 int family)
951{ 877{
952 struct tcp_sock *tp = tcp_sk(sk); 878 struct tcp_sock *tp = tcp_sk(sk);
953 struct tcp_md5sig_key *key; 879 int i;
954 struct hlist_node *pos; 880
955 unsigned int size = sizeof(struct in_addr); 881 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
956 struct tcp_md5sig_info *md5sig;
957
958 /* caller either holds rcu_read_lock() or socket lock */
959 md5sig = rcu_dereference_check(tp->md5sig_info,
960 sock_owned_by_user(sk) ||
961 lockdep_is_held(&sk->sk_lock.slock));
962 if (!md5sig)
963 return NULL; 882 return NULL;
964#if IS_ENABLED(CONFIG_IPV6) 883 for (i = 0; i < tp->md5sig_info->entries4; i++) {
965 if (family == AF_INET6) 884 if (tp->md5sig_info->keys4[i].addr == addr)
966 size = sizeof(struct in6_addr); 885 return &tp->md5sig_info->keys4[i].base;
967#endif
968 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
969 if (key->family != family)
970 continue;
971 if (!memcmp(&key->addr, addr, size))
972 return key;
973 } 886 }
974 return NULL; 887 return NULL;
975} 888}
976EXPORT_SYMBOL(tcp_md5_do_lookup);
977 889
978struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, 890struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
979 struct sock *addr_sk) 891 struct sock *addr_sk)
980{ 892{
981 union tcp_md5_addr *addr; 893 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
982
983 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
984 return tcp_md5_do_lookup(sk, addr, AF_INET);
985} 894}
986EXPORT_SYMBOL(tcp_v4_md5_lookup); 895EXPORT_SYMBOL(tcp_v4_md5_lookup);
987 896
988static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, 897static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
989 struct request_sock *req) 898 struct request_sock *req)
990{ 899{
991 union tcp_md5_addr *addr; 900 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
992
993 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
994 return tcp_md5_do_lookup(sk, addr, AF_INET);
995} 901}
996 902
997/* This can be called on a newly created socket, from other files */ 903/* This can be called on a newly created socket, from other files */
998int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 904int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
999 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) 905 u8 *newkey, u8 newkeylen)
1000{ 906{
1001 /* Add Key to the list */ 907 /* Add Key to the list */
1002 struct tcp_md5sig_key *key; 908 struct tcp_md5sig_key *key;
1003 struct tcp_sock *tp = tcp_sk(sk); 909 struct tcp_sock *tp = tcp_sk(sk);
1004 struct tcp_md5sig_info *md5sig; 910 struct tcp4_md5sig_key *keys;
1005 911
1006 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); 912 key = tcp_v4_md5_do_lookup(sk, addr);
1007 if (key) { 913 if (key) {
1008 /* Pre-existing entry - just update that one. */ 914 /* Pre-existing entry - just update that one. */
1009 memcpy(key->key, newkey, newkeylen); 915 kfree(key->key);
916 key->key = newkey;
1010 key->keylen = newkeylen; 917 key->keylen = newkeylen;
1011 return 0; 918 } else {
1012 } 919 struct tcp_md5sig_info *md5sig;
920
921 if (!tp->md5sig_info) {
922 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
923 GFP_ATOMIC);
924 if (!tp->md5sig_info) {
925 kfree(newkey);
926 return -ENOMEM;
927 }
928 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
929 }
1013 930
1014 md5sig = rcu_dereference_protected(tp->md5sig_info, 931 md5sig = tp->md5sig_info;
1015 sock_owned_by_user(sk)); 932 if (md5sig->entries4 == 0 &&
1016 if (!md5sig) { 933 tcp_alloc_md5sig_pool(sk) == NULL) {
1017 md5sig = kmalloc(sizeof(*md5sig), gfp); 934 kfree(newkey);
1018 if (!md5sig)
1019 return -ENOMEM; 935 return -ENOMEM;
936 }
1020 937
1021 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 938 if (md5sig->alloced4 == md5sig->entries4) {
1022 INIT_HLIST_HEAD(&md5sig->head); 939 keys = kmalloc((sizeof(*keys) *
1023 rcu_assign_pointer(tp->md5sig_info, md5sig); 940 (md5sig->entries4 + 1)), GFP_ATOMIC);
1024 } 941 if (!keys) {
942 kfree(newkey);
943 if (md5sig->entries4 == 0)
944 tcp_free_md5sig_pool();
945 return -ENOMEM;
946 }
1025 947
1026 key = sock_kmalloc(sk, sizeof(*key), gfp); 948 if (md5sig->entries4)
1027 if (!key) 949 memcpy(keys, md5sig->keys4,
1028 return -ENOMEM; 950 sizeof(*keys) * md5sig->entries4);
1029 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1030 sock_kfree_s(sk, key, sizeof(*key));
1031 return -ENOMEM;
1032 }
1033 951
1034 memcpy(key->key, newkey, newkeylen); 952 /* Free old key list, and reference new one */
1035 key->keylen = newkeylen; 953 kfree(md5sig->keys4);
1036 key->family = family; 954 md5sig->keys4 = keys;
1037 memcpy(&key->addr, addr, 955 md5sig->alloced4++;
1038 (family == AF_INET6) ? sizeof(struct in6_addr) : 956 }
1039 sizeof(struct in_addr)); 957 md5sig->entries4++;
1040 hlist_add_head_rcu(&key->node, &md5sig->head); 958 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
959 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
960 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
961 }
1041 return 0; 962 return 0;
1042} 963}
1043EXPORT_SYMBOL(tcp_md5_do_add); 964EXPORT_SYMBOL(tcp_v4_md5_do_add);
1044 965
1045int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) 966static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
967 u8 *newkey, u8 newkeylen)
1046{ 968{
1047 struct tcp_sock *tp = tcp_sk(sk); 969 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
1048 struct tcp_md5sig_key *key; 970 newkey, newkeylen);
1049 struct tcp_md5sig_info *md5sig;
1050
1051 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1052 if (!key)
1053 return -ENOENT;
1054 hlist_del_rcu(&key->node);
1055 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056 kfree_rcu(key, rcu);
1057 md5sig = rcu_dereference_protected(tp->md5sig_info,
1058 sock_owned_by_user(sk));
1059 if (hlist_empty(&md5sig->head))
1060 tcp_free_md5sig_pool();
1061 return 0;
1062} 971}
1063EXPORT_SYMBOL(tcp_md5_do_del);
1064 972
1065static void tcp_clear_md5_list(struct sock *sk) 973int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
1066{ 974{
1067 struct tcp_sock *tp = tcp_sk(sk); 975 struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key; 976 int i;
1069 struct hlist_node *pos, *n; 977
1070 struct tcp_md5sig_info *md5sig; 978 for (i = 0; i < tp->md5sig_info->entries4; i++) {
979 if (tp->md5sig_info->keys4[i].addr == addr) {
980 /* Free the key */
981 kfree(tp->md5sig_info->keys4[i].base.key);
982 tp->md5sig_info->entries4--;
983
984 if (tp->md5sig_info->entries4 == 0) {
985 kfree(tp->md5sig_info->keys4);
986 tp->md5sig_info->keys4 = NULL;
987 tp->md5sig_info->alloced4 = 0;
988 tcp_free_md5sig_pool();
989 } else if (tp->md5sig_info->entries4 != i) {
990 /* Need to do some manipulation */
991 memmove(&tp->md5sig_info->keys4[i],
992 &tp->md5sig_info->keys4[i+1],
993 (tp->md5sig_info->entries4 - i) *
994 sizeof(struct tcp4_md5sig_key));
995 }
996 return 0;
997 }
998 }
999 return -ENOENT;
1000}
1001EXPORT_SYMBOL(tcp_v4_md5_do_del);
1071 1002
1072 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1003static void tcp_v4_clear_md5_list(struct sock *sk)
1004{
1005 struct tcp_sock *tp = tcp_sk(sk);
1073 1006
1074 if (!hlist_empty(&md5sig->head)) 1007 /* Free each key, then the set of key keys,
1008 * the crypto element, and then decrement our
1009 * hold on the last resort crypto.
1010 */
1011 if (tp->md5sig_info->entries4) {
1012 int i;
1013 for (i = 0; i < tp->md5sig_info->entries4; i++)
1014 kfree(tp->md5sig_info->keys4[i].base.key);
1015 tp->md5sig_info->entries4 = 0;
1075 tcp_free_md5sig_pool(); 1016 tcp_free_md5sig_pool();
1076 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) { 1017 }
1077 hlist_del_rcu(&key->node); 1018 if (tp->md5sig_info->keys4) {
1078 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1019 kfree(tp->md5sig_info->keys4);
1079 kfree_rcu(key, rcu); 1020 tp->md5sig_info->keys4 = NULL;
1021 tp->md5sig_info->alloced4 = 0;
1080 } 1022 }
1081} 1023}
1082 1024
@@ -1085,6 +1027,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1085{ 1027{
1086 struct tcp_md5sig cmd; 1028 struct tcp_md5sig cmd;
1087 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1029 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1030 u8 *newkey;
1088 1031
1089 if (optlen < sizeof(cmd)) 1032 if (optlen < sizeof(cmd))
1090 return -EINVAL; 1033 return -EINVAL;
@@ -1095,16 +1038,32 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1095 if (sin->sin_family != AF_INET) 1038 if (sin->sin_family != AF_INET)
1096 return -EINVAL; 1039 return -EINVAL;
1097 1040
1098 if (!cmd.tcpm_key || !cmd.tcpm_keylen) 1041 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1099 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1042 if (!tcp_sk(sk)->md5sig_info)
1100 AF_INET); 1043 return -ENOENT;
1044 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1045 }
1101 1046
1102 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1047 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1103 return -EINVAL; 1048 return -EINVAL;
1104 1049
1105 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1050 if (!tcp_sk(sk)->md5sig_info) {
1106 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, 1051 struct tcp_sock *tp = tcp_sk(sk);
1107 GFP_KERNEL); 1052 struct tcp_md5sig_info *p;
1053
1054 p = kzalloc(sizeof(*p), sk->sk_allocation);
1055 if (!p)
1056 return -EINVAL;
1057
1058 tp->md5sig_info = p;
1059 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1060 }
1061
1062 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1063 if (!newkey)
1064 return -ENOMEM;
1065 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1066 newkey, cmd.tcpm_keylen);
1108} 1067}
1109 1068
1110static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, 1069static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
@@ -1130,8 +1089,8 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1130 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); 1089 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1131} 1090}
1132 1091
1133static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1092static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1134 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1093 __be32 daddr, __be32 saddr, struct tcphdr *th)
1135{ 1094{
1136 struct tcp_md5sig_pool *hp; 1095 struct tcp_md5sig_pool *hp;
1137 struct hash_desc *desc; 1096 struct hash_desc *desc;
@@ -1163,12 +1122,12 @@ clear_hash_noput:
1163} 1122}
1164 1123
1165int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, 1124int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1166 const struct sock *sk, const struct request_sock *req, 1125 struct sock *sk, struct request_sock *req,
1167 const struct sk_buff *skb) 1126 struct sk_buff *skb)
1168{ 1127{
1169 struct tcp_md5sig_pool *hp; 1128 struct tcp_md5sig_pool *hp;
1170 struct hash_desc *desc; 1129 struct hash_desc *desc;
1171 const struct tcphdr *th = tcp_hdr(skb); 1130 struct tcphdr *th = tcp_hdr(skb);
1172 __be32 saddr, daddr; 1131 __be32 saddr, daddr;
1173 1132
1174 if (sk) { 1133 if (sk) {
@@ -1213,7 +1172,7 @@ clear_hash_noput:
1213} 1172}
1214EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1173EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1215 1174
1216static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) 1175static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1217{ 1176{
1218 /* 1177 /*
1219 * This gets called for each TCP segment that arrives 1178 * This gets called for each TCP segment that arrives
@@ -1223,29 +1182,28 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1223 * o MD5 hash and we're not expecting one. 1182 * o MD5 hash and we're not expecting one.
1224 * o MD5 hash and its wrong. 1183 * o MD5 hash and its wrong.
1225 */ 1184 */
1226 const __u8 *hash_location = NULL; 1185 __u8 *hash_location = NULL;
1227 struct tcp_md5sig_key *hash_expected; 1186 struct tcp_md5sig_key *hash_expected;
1228 const struct iphdr *iph = ip_hdr(skb); 1187 const struct iphdr *iph = ip_hdr(skb);
1229 const struct tcphdr *th = tcp_hdr(skb); 1188 struct tcphdr *th = tcp_hdr(skb);
1230 int genhash; 1189 int genhash;
1231 unsigned char newhash[16]; 1190 unsigned char newhash[16];
1232 1191
1233 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1192 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1234 AF_INET);
1235 hash_location = tcp_parse_md5sig_option(th); 1193 hash_location = tcp_parse_md5sig_option(th);
1236 1194
1237 /* We've parsed the options - do we have a hash? */ 1195 /* We've parsed the options - do we have a hash? */
1238 if (!hash_expected && !hash_location) 1196 if (!hash_expected && !hash_location)
1239 return false; 1197 return 0;
1240 1198
1241 if (hash_expected && !hash_location) { 1199 if (hash_expected && !hash_location) {
1242 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1200 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1243 return true; 1201 return 1;
1244 } 1202 }
1245 1203
1246 if (!hash_expected && hash_location) { 1204 if (!hash_expected && hash_location) {
1247 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1205 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1248 return true; 1206 return 1;
1249 } 1207 }
1250 1208
1251 /* Okay, so this is hash_expected and hash_location - 1209 /* Okay, so this is hash_expected and hash_location -
@@ -1256,14 +1214,15 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1256 NULL, NULL, skb); 1214 NULL, NULL, skb);
1257 1215
1258 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1216 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1259 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1217 if (net_ratelimit()) {
1260 &iph->saddr, ntohs(th->source), 1218 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1261 &iph->daddr, ntohs(th->dest), 1219 &iph->saddr, ntohs(th->source),
1262 genhash ? " tcp_v4_calc_md5_hash failed" 1220 &iph->daddr, ntohs(th->dest),
1263 : ""); 1221 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1264 return true; 1222 }
1223 return 1;
1265 } 1224 }
1266 return false; 1225 return 0;
1267} 1226}
1268 1227
1269#endif 1228#endif
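The comment at the top of tcp_v4_inbound_md5_hash() enumerates the drop cases: a hash we expected but did not get, a hash we got but did not expect, and a hash that does not match. A small decision sketch of that logic; "computed" stands for the digest we derive from our stored key (NULL if we hold no key for this peer), "received" for the option carried in the segment:

#include <stdbool.h>
#include <string.h>

/* Returns true when the segment must be dropped. */
bool inbound_md5_should_drop(const unsigned char *computed,
                             const unsigned char *received)
{
    if (!computed && !received)
        return false;                 /* no MD5 on this connection          */
    if (!computed || !received)
        return true;                  /* one side signs, the other does not */
    return memcmp(computed, received, 16) != 0;  /* both present: compare   */
}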
@@ -1285,189 +1244,11 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1285}; 1244};
1286#endif 1245#endif
1287 1246
1288static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1289 struct request_sock *req,
1290 struct tcp_fastopen_cookie *foc,
1291 struct tcp_fastopen_cookie *valid_foc)
1292{
1293 bool skip_cookie = false;
1294 struct fastopen_queue *fastopenq;
1295
1296 if (likely(!fastopen_cookie_present(foc))) {
1297 /* See include/net/tcp.h for the meaning of these knobs */
1298 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1299 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1300 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1301 skip_cookie = true; /* no cookie to validate */
1302 else
1303 return false;
1304 }
1305 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1306 /* A FO option is present; bump the counter. */
1307 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1308
1309 /* Make sure the listener has enabled fastopen, and we don't
1310 * exceed the max # of pending TFO requests allowed before trying
1311 * to validating the cookie in order to avoid burning CPU cycles
1312 * unnecessarily.
1313 *
1314 * XXX (TFO) - The implication of checking the max_qlen before
1315 * processing a cookie request is that clients can't differentiate
1316 * between qlen overflow causing Fast Open to be disabled
1317 * temporarily vs a server not supporting Fast Open at all.
1318 */
1319 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1320 fastopenq == NULL || fastopenq->max_qlen == 0)
1321 return false;
1322
1323 if (fastopenq->qlen >= fastopenq->max_qlen) {
1324 struct request_sock *req1;
1325 spin_lock(&fastopenq->lock);
1326 req1 = fastopenq->rskq_rst_head;
1327 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1328 spin_unlock(&fastopenq->lock);
1329 NET_INC_STATS_BH(sock_net(sk),
1330 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1331 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1332 foc->len = -1;
1333 return false;
1334 }
1335 fastopenq->rskq_rst_head = req1->dl_next;
1336 fastopenq->qlen--;
1337 spin_unlock(&fastopenq->lock);
1338 reqsk_free(req1);
1339 }
1340 if (skip_cookie) {
1341 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1342 return true;
1343 }
1344 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1345 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1346 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1347 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1348 memcmp(&foc->val[0], &valid_foc->val[0],
1349 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1350 return false;
1351 valid_foc->len = -1;
1352 }
1353 /* Acknowledge the data received from the peer. */
1354 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1355 return true;
1356 } else if (foc->len == 0) { /* Client requesting a cookie */
1357 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1358 NET_INC_STATS_BH(sock_net(sk),
1359 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1360 } else {
1361 /* Client sent a cookie with wrong size. Treat it
1362 * the same as invalid and return a valid one.
1363 */
1364 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365 }
1366 return false;
1367}
1368
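tcp_fastopen_check(), removed wholesale in this hunk, validates the client's Fast Open cookie on the server: a missing option may still be accepted under the "always"/"cookie not required" sysctl modes, a full-size cookie must match the one the server would generate for the client's address, and a zero-length option is a cookie request. A compact sketch of the core comparison, with a deliberately toy generator standing in for the real keyed per-host cookie:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define FO_COOKIE_SIZE 8   /* stand-in for TCP_FASTOPEN_COOKIE_SIZE */

/* Toy generator: real servers derive the cookie with a keyed hash of
 * the client address; this exists only to keep the sketch self-contained. */
static void gen_cookie(uint32_t client_addr, uint8_t out[FO_COOKIE_SIZE])
{
    for (int i = 0; i < FO_COOKIE_SIZE; i++)
        out[i] = (uint8_t)(client_addr >> ((i % 4) * 8)) ^ 0x5a;
}

/* Returns true when the SYN's payload may be accepted right away.
 * *want_cookie_reply is set when a (re)issued cookie should be sent. */
bool fastopen_cookie_ok(uint32_t client_addr, const uint8_t *cookie, int len,
                        bool *want_cookie_reply)
{
    uint8_t valid[FO_COOKIE_SIZE];

    *want_cookie_reply = false;
    if (len == FO_COOKIE_SIZE) {
        gen_cookie(client_addr, valid);
        return memcmp(cookie, valid, FO_COOKIE_SIZE) == 0;
    }
    *want_cookie_reply = true;   /* len == 0: cookie request; other sizes: reissue */
    return false;
}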
1369static int tcp_v4_conn_req_fastopen(struct sock *sk,
1370 struct sk_buff *skb,
1371 struct sk_buff *skb_synack,
1372 struct request_sock *req,
1373 struct request_values *rvp)
1374{
1375 struct tcp_sock *tp = tcp_sk(sk);
1376 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1377 const struct inet_request_sock *ireq = inet_rsk(req);
1378 struct sock *child;
1379 int err;
1380
1381 req->num_retrans = 0;
1382 req->num_timeout = 0;
1383 req->sk = NULL;
1384
1385 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1386 if (child == NULL) {
1387 NET_INC_STATS_BH(sock_net(sk),
1388 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1389 kfree_skb(skb_synack);
1390 return -1;
1391 }
1392 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1393 ireq->rmt_addr, ireq->opt);
1394 err = net_xmit_eval(err);
1395 if (!err)
1396 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1397 /* XXX (TFO) - is it ok to ignore error and continue? */
1398
1399 spin_lock(&queue->fastopenq->lock);
1400 queue->fastopenq->qlen++;
1401 spin_unlock(&queue->fastopenq->lock);
1402
1403 /* Initialize the child socket. Have to fix some values to take
1404 * into account the child is a Fast Open socket and is created
1405 * only out of the bits carried in the SYN packet.
1406 */
1407 tp = tcp_sk(child);
1408
1409 tp->fastopen_rsk = req;
1410 /* Do a hold on the listener sk so that if the listener is being
1411 * closed, the child that has been accepted can live on and still
1412 * access listen_lock.
1413 */
1414 sock_hold(sk);
1415 tcp_rsk(req)->listener = sk;
1416
1417 /* RFC1323: The window in SYN & SYN/ACK segments is never
1418 * scaled. So correct it appropriately.
1419 */
1420 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1421
1422 /* Activate the retrans timer so that SYNACK can be retransmitted.
1423 * The request socket is not added to the SYN table of the parent
1424 * because it's been added to the accept queue directly.
1425 */
1426 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1427 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1428
1429 /* Add the child socket directly into the accept queue */
1430 inet_csk_reqsk_queue_add(sk, req, child);
1431
1432 /* Now finish processing the fastopen child socket. */
1433 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1434 tcp_init_congestion_control(child);
1435 tcp_mtup_init(child);
1436 tcp_init_buffer_space(child);
1437 tcp_init_metrics(child);
1438
1439 /* Queue the data carried in the SYN packet. We need to first
1440 * bump skb's refcnt because the caller will attempt to free it.
1441 *
1442 * XXX (TFO) - we honor a zero-payload TFO request for now.
1443 * (Any reason not to?)
1444 */
1445 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1446 /* Don't queue the skb if there is no payload in SYN.
1447 * XXX (TFO) - How about SYN+FIN?
1448 */
1449 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1450 } else {
1451 skb = skb_get(skb);
1452 skb_dst_drop(skb);
1453 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1454 skb_set_owner_r(skb, child);
1455 __skb_queue_tail(&child->sk_receive_queue, skb);
1456 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1457 tp->syn_data_acked = 1;
1458 }
1459 sk->sk_data_ready(sk, 0);
1460 bh_unlock_sock(child);
1461 sock_put(child);
1462 WARN_ON(req->sk == NULL);
1463 return 0;
1464}
1465
1466int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1247int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1467{ 1248{
1468 struct tcp_extend_values tmp_ext; 1249 struct tcp_extend_values tmp_ext;
1469 struct tcp_options_received tmp_opt; 1250 struct tcp_options_received tmp_opt;
1470 const u8 *hash_location; 1251 u8 *hash_location;
1471 struct request_sock *req; 1252 struct request_sock *req;
1472 struct inet_request_sock *ireq; 1253 struct inet_request_sock *ireq;
1473 struct tcp_sock *tp = tcp_sk(sk); 1254 struct tcp_sock *tp = tcp_sk(sk);
@@ -1475,12 +1256,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1475 __be32 saddr = ip_hdr(skb)->saddr; 1256 __be32 saddr = ip_hdr(skb)->saddr;
1476 __be32 daddr = ip_hdr(skb)->daddr; 1257 __be32 daddr = ip_hdr(skb)->daddr;
1477 __u32 isn = TCP_SKB_CB(skb)->when; 1258 __u32 isn = TCP_SKB_CB(skb)->when;
1478 bool want_cookie = false; 1259 int want_cookie = 0;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1484 1260
1485 /* Never answer to SYNs sent to broadcast or multicast */ 1261
1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1262 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1515,8 +1291,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1515 tcp_clear_options(&tmp_opt); 1291 tcp_clear_options(&tmp_opt);
1516 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1292 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1517 tmp_opt.user_mss = tp->rx_opt.user_mss; 1293 tmp_opt.user_mss = tp->rx_opt.user_mss;
1518 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, 1294 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1519 want_cookie ? NULL : &foc);
1520 1295
1521 if (tmp_opt.cookie_plus > 0 && 1296 if (tmp_opt.cookie_plus > 0 &&
1522 tmp_opt.saw_tstamp && 1297 tmp_opt.saw_tstamp &&
@@ -1540,7 +1315,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1540 while (l-- > 0) 1315 while (l-- > 0)
1541 *c++ ^= *hash_location++; 1316 *c++ ^= *hash_location++;
1542 1317
1543 want_cookie = false; /* not our kind of cookie */ 1318 want_cookie = 0; /* not our kind of cookie */
1544 tmp_ext.cookie_out_never = 0; /* false */ 1319 tmp_ext.cookie_out_never = 0; /* false */
1545 tmp_ext.cookie_plus = tmp_opt.cookie_plus; 1320 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1546 } else if (!tp->rx_opt.cookie_in_always) { 1321 } else if (!tp->rx_opt.cookie_in_always) {
@@ -1562,18 +1337,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1562 ireq->loc_addr = daddr; 1337 ireq->loc_addr = daddr;
1563 ireq->rmt_addr = saddr; 1338 ireq->rmt_addr = saddr;
1564 ireq->no_srccheck = inet_sk(sk)->transparent; 1339 ireq->no_srccheck = inet_sk(sk)->transparent;
1565 ireq->opt = tcp_v4_save_options(skb); 1340 ireq->opt = tcp_v4_save_options(sk, skb);
1566 1341
1567 if (security_inet_conn_request(sk, skb, req)) 1342 if (security_inet_conn_request(sk, skb, req))
1568 goto drop_and_free; 1343 goto drop_and_free;
1569 1344
1570 if (!want_cookie || tmp_opt.tstamp_ok) 1345 if (!want_cookie || tmp_opt.tstamp_ok)
1571 TCP_ECN_create_request(req, skb); 1346 TCP_ECN_create_request(req, tcp_hdr(skb));
1572 1347
1573 if (want_cookie) { 1348 if (want_cookie) {
1574 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1349 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1575 req->cookie_ts = tmp_opt.tstamp_ok; 1350 req->cookie_ts = tmp_opt.tstamp_ok;
1576 } else if (!isn) { 1351 } else if (!isn) {
1352 struct inet_peer *peer = NULL;
1353 struct flowi4 fl4;
1354
1577 /* VJ's idea. We save last timestamp seen 1355 /* VJ's idea. We save last timestamp seen
1578 * from the destination in peer table, when entering 1356 * from the destination in peer table, when entering
1579 * state TIME-WAIT, and check against it before 1357 * state TIME-WAIT, and check against it before
@@ -1586,8 +1364,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1586 if (tmp_opt.saw_tstamp && 1364 if (tmp_opt.saw_tstamp &&
1587 tcp_death_row.sysctl_tw_recycle && 1365 tcp_death_row.sysctl_tw_recycle &&
1588 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && 1366 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1589 fl4.daddr == saddr) { 1367 fl4.daddr == saddr &&
1590 if (!tcp_peer_is_proven(req, dst, true)) { 1368 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1369 inet_peer_refcheck(peer);
1370 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1371 (s32)(peer->tcp_ts - req->ts_recent) >
1372 TCP_PAWS_WINDOW) {
1591 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1373 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1592 goto drop_and_release; 1374 goto drop_and_release;
1593 } 1375 }
@@ -1596,7 +1378,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1596 else if (!sysctl_tcp_syncookies && 1378 else if (!sysctl_tcp_syncookies &&
1597 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1379 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1598 (sysctl_max_syn_backlog >> 2)) && 1380 (sysctl_max_syn_backlog >> 2)) &&
1599 !tcp_peer_is_proven(req, dst, false)) { 1381 (!peer || !peer->tcp_ts_stamp) &&
1382 (!dst || !dst_metric(dst, RTAX_RTT))) {
1600 /* Without syncookies last quarter of 1383 /* Without syncookies last quarter of
1601 * backlog is filled with destinations, 1384 * backlog is filled with destinations,
1602 * proven to be alive. 1385 * proven to be alive.
@@ -1604,7 +1387,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1604 * to destinations, already remembered 1387 * to destinations, already remembered
1605 * to the moment of synflood. 1388 * to the moment of synflood.
1606 */ 1389 */
1607 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), 1390 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1608 &saddr, ntohs(tcp_hdr(skb)->source)); 1391 &saddr, ntohs(tcp_hdr(skb)->source));
1609 goto drop_and_release; 1392 goto drop_and_release;
1610 } 1393 }
@@ -1612,54 +1395,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1612 isn = tcp_v4_init_sequence(skb); 1395 isn = tcp_v4_init_sequence(skb);
1613 } 1396 }
1614 tcp_rsk(req)->snt_isn = isn; 1397 tcp_rsk(req)->snt_isn = isn;
1398 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1615 1399
1616 if (dst == NULL) { 1400 if (tcp_v4_send_synack(sk, dst, req,
1617 dst = inet_csk_route_req(sk, &fl4, req); 1401 (struct request_values *)&tmp_ext) ||
1618 if (dst == NULL) 1402 want_cookie)
1619 goto drop_and_free;
1620 }
1621 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1622
1623 /* We don't call tcp_v4_send_synack() directly because we need
1624 * to make sure a child socket can be created successfully before
1625 * sending back synack!
1626 *
1627 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1628 * (or better yet, call tcp_send_synack() in the child context
1629 * directly, but will have to fix bunch of other code first)
1630 * after syn_recv_sock() except one will need to first fix the
1631 * latter to remove its dependency on the current implementation
1632 * of tcp_v4_send_synack()->tcp_select_initial_window().
1633 */
1634 skb_synack = tcp_make_synack(sk, dst, req,
1635 (struct request_values *)&tmp_ext,
1636 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1637
1638 if (skb_synack) {
1639 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1640 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1641 } else
1642 goto drop_and_free;
1643
1644 if (likely(!do_fastopen)) {
1645 int err;
1646 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1647 ireq->rmt_addr, ireq->opt);
1648 err = net_xmit_eval(err);
1649 if (err || want_cookie)
1650 goto drop_and_free;
1651
1652 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1653 tcp_rsk(req)->listener = NULL;
1654 /* Add the request_sock to the SYN table */
1655 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657 NET_INC_STATS_BH(sock_net(sk),
1658 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660 (struct request_values *)&tmp_ext))
1661 goto drop_and_free; 1403 goto drop_and_free;
1662 1404
1405 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1663 return 0; 1406 return 0;
1664 1407
1665drop_and_release: 1408drop_and_release:
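
The hunk above restores the inetpeer-based variant of the PAWS recycling check described in the "VJ's idea" comment: a cached per-destination timestamp is only trusted while it is younger than TCP_PAWS_MSL, and a SYN whose timestamp has gone backwards by more than TCP_PAWS_WINDOW is rejected. The following is a minimal standalone sketch of that freshness test, not kernel code; TCP_PAWS_MSL = 60 and TCP_PAWS_WINDOW = 1 are the values these macros are assumed to have in this tree.

/* Standalone sketch of the PAWS-style freshness test used above.
 * peer_ts / peer_ts_stamp stand in for the inetpeer (or tcp_metrics)
 * fields; all comparisons use the same wraparound-safe arithmetic. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TCP_PAWS_MSL    60  /* seconds a cached timestamp stays trustworthy */
#define TCP_PAWS_WINDOW  1  /* allowed ticks of timestamp regression */

static bool paws_reject_syn(uint32_t now, uint32_t peer_ts_stamp,
                            uint32_t peer_ts, uint32_t req_ts_recent)
{
	/* Reject only if the cached timestamp is recent enough to trust
	 * and the SYN's timestamp went backwards by more than the window. */
	return (now - peer_ts_stamp) < TCP_PAWS_MSL &&
	       (int32_t)(peer_ts - req_ts_recent) > TCP_PAWS_WINDOW;
}

int main(void)
{
	uint32_t now = (uint32_t)time(NULL);

	/* Timestamp regressed by 1000 ticks within the MSL window: drop. */
	printf("%d\n", paws_reject_syn(now, now - 5, 100000, 99000));
	/* Cached state is stale (older than TCP_PAWS_MSL): accept. */
	printf("%d\n", paws_reject_syn(now, now - 120, 100000, 99000));
	return 0;
}
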
@@ -1697,7 +1440,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1697 goto exit_nonewsk; 1440 goto exit_nonewsk;
1698 1441
1699 newsk->sk_gso_type = SKB_GSO_TCPV4; 1442 newsk->sk_gso_type = SKB_GSO_TCPV4;
1700 inet_sk_rx_dst_set(newsk, skb);
1701 1443
1702 newtp = tcp_sk(newsk); 1444 newtp = tcp_sk(newsk);
1703 newinet = inet_sk(newsk); 1445 newinet = inet_sk(newsk);
@@ -1710,19 +1452,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1710 ireq->opt = NULL; 1452 ireq->opt = NULL;
1711 newinet->mc_index = inet_iif(skb); 1453 newinet->mc_index = inet_iif(skb);
1712 newinet->mc_ttl = ip_hdr(skb)->ttl; 1454 newinet->mc_ttl = ip_hdr(skb)->ttl;
1713 newinet->rcv_tos = ip_hdr(skb)->tos;
1714 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1455 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1715 if (inet_opt) 1456 if (inet_opt)
1716 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1457 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1717 newinet->inet_id = newtp->write_seq ^ jiffies; 1458 newinet->inet_id = newtp->write_seq ^ jiffies;
1718 1459
1719 if (!dst) { 1460 if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1720 dst = inet_csk_route_child_sock(sk, newsk, req); 1461 goto put_and_exit;
1721 if (!dst) 1462
1722 goto put_and_exit;
1723 } else {
1724 /* syncookie case : see end of cookie_v4_check() */
1725 }
1726 sk_setup_caps(newsk, dst); 1463 sk_setup_caps(newsk, dst);
1727 1464
1728 tcp_mtup_init(newsk); 1465 tcp_mtup_init(newsk);
@@ -1733,13 +1470,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1733 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1470 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1734 1471
1735 tcp_initialize_rcv_mss(newsk); 1472 tcp_initialize_rcv_mss(newsk);
1736 tcp_synack_rtt_meas(newsk, req); 1473 if (tcp_rsk(req)->snt_synack)
1737 newtp->total_retrans = req->num_retrans; 1474 tcp_valid_rtt_meas(newsk,
1475 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1476 newtp->total_retrans = req->retrans;
1738 1477
1739#ifdef CONFIG_TCP_MD5SIG 1478#ifdef CONFIG_TCP_MD5SIG
1740 /* Copy over the MD5 key from the original socket */ 1479 /* Copy over the MD5 key from the original socket */
1741 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1480 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1742 AF_INET);
1743 if (key != NULL) { 1481 if (key != NULL) {
1744 /* 1482 /*
1745 * We're using one, so create a matching key 1483 * We're using one, so create a matching key
@@ -1747,8 +1485,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1747 * memory, then we end up not copying the key 1485 * memory, then we end up not copying the key
1748 * across. Shucks. 1486 * across. Shucks.
1749 */ 1487 */
1750 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1488 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1751 AF_INET, key->key, key->keylen, GFP_ATOMIC); 1489 if (newkey != NULL)
1490 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1491 newkey, key->keylen);
1752 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1492 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1753 } 1493 }
1754#endif 1494#endif
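
The hunk above replaces tcp_synack_rtt_meas() with the older pattern: the listener stamps snt_synack when the SYN-ACK goes out, and when the final ACK creates the child socket the RTT sample is simply tcp_time_stamp - snt_synack (taken only if the stamp is non-zero). A small sketch of that idea, where now_ms() is a hypothetical stand-in for the kernel's jiffies-based clock:

/* Sketch of the SYN-ACK RTT sample restored above: remember when the
 * SYN-ACK was sent and subtract that from "now" once the handshake
 * completes. Purely illustrative, not the kernel helpers. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint32_t)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

struct toy_request { uint32_t snt_synack; };

int main(void)
{
	struct toy_request req = { .snt_synack = now_ms() }; /* SYN-ACK transmitted */

	/* ... the final ACK of the handshake arrives some time later ... */
	uint32_t rtt = now_ms() - req.snt_synack;
	printf("synack rtt sample: %u ms (used only if snt_synack != 0)\n", rtt);
	return 0;
}
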
@@ -1767,8 +1507,7 @@ exit:
1767 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1507 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1768 return NULL; 1508 return NULL;
1769put_and_exit: 1509put_and_exit:
1770 inet_csk_prepare_forced_close(newsk); 1510 sock_put(newsk);
1771 tcp_done(newsk);
1772 goto exit; 1511 goto exit;
1773} 1512}
1774EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1513EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -1783,7 +1522,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1783 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1522 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1784 iph->saddr, iph->daddr); 1523 iph->saddr, iph->daddr);
1785 if (req) 1524 if (req)
1786 return tcp_check_req(sk, skb, req, prev, false); 1525 return tcp_check_req(sk, skb, req, prev);
1787 1526
1788 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1527 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1789 th->source, iph->daddr, th->dest, inet_iif(skb)); 1528 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1849,16 +1588,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1849#endif 1588#endif
1850 1589
1851 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1590 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1852 struct dst_entry *dst = sk->sk_rx_dst; 1591 sock_rps_save_rxhash(sk, skb->rxhash);
1853
1854 sock_rps_save_rxhash(sk, skb);
1855 if (dst) {
1856 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1857 dst->ops->check(dst, 0) == NULL) {
1858 dst_release(dst);
1859 sk->sk_rx_dst = NULL;
1860 }
1861 }
1862 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1592 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1863 rsk = sk; 1593 rsk = sk;
1864 goto reset; 1594 goto reset;
@@ -1875,7 +1605,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1875 goto discard; 1605 goto discard;
1876 1606
1877 if (nsk != sk) { 1607 if (nsk != sk) {
1878 sock_rps_save_rxhash(nsk, skb); 1608 sock_rps_save_rxhash(nsk, skb->rxhash);
1879 if (tcp_child_process(sk, nsk, skb)) { 1609 if (tcp_child_process(sk, nsk, skb)) {
1880 rsk = nsk; 1610 rsk = nsk;
1881 goto reset; 1611 goto reset;
@@ -1883,7 +1613,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1883 return 0; 1613 return 0;
1884 } 1614 }
1885 } else 1615 } else
1886 sock_rps_save_rxhash(sk, skb); 1616 sock_rps_save_rxhash(sk, skb->rxhash);
1887 1617
1888 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1618 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1889 rsk = sk; 1619 rsk = sk;
@@ -1908,43 +1638,6 @@ csum_err:
1908} 1638}
1909EXPORT_SYMBOL(tcp_v4_do_rcv); 1639EXPORT_SYMBOL(tcp_v4_do_rcv);
1910 1640
1911void tcp_v4_early_demux(struct sk_buff *skb)
1912{
1913 const struct iphdr *iph;
1914 const struct tcphdr *th;
1915 struct sock *sk;
1916
1917 if (skb->pkt_type != PACKET_HOST)
1918 return;
1919
1920 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1921 return;
1922
1923 iph = ip_hdr(skb);
1924 th = tcp_hdr(skb);
1925
1926 if (th->doff < sizeof(struct tcphdr) / 4)
1927 return;
1928
1929 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1930 iph->saddr, th->source,
1931 iph->daddr, ntohs(th->dest),
1932 skb->skb_iif);
1933 if (sk) {
1934 skb->sk = sk;
1935 skb->destructor = sock_edemux;
1936 if (sk->sk_state != TCP_TIME_WAIT) {
1937 struct dst_entry *dst = sk->sk_rx_dst;
1938
1939 if (dst)
1940 dst = dst_check(dst, 0);
1941 if (dst &&
1942 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1943 skb_dst_set_noref(skb, dst);
1944 }
1945 }
1946}
1947
1948/* 1641/*
1949 * From tcp_input.c 1642 * From tcp_input.c
1950 */ 1643 */
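
The hunk above also drops tcp_v4_early_demux(), which short-circuits routing for established flows by looking the socket up from the packet's 4-tuple as soon as the TCP header is readable, then reusing the destination cached on that socket. A toy sketch of the idea follows; the single-slot-per-bucket table and the hash function are invented for illustration and do not mirror the real inet hash tables.

/* Toy "early demux": hash the 4-tuple of an incoming segment and look
 * for an established connection before the full routing step. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct flow4 { uint32_t saddr, daddr; uint16_t sport, dport; };
struct toy_sock { struct flow4 key; int in_use; };

#define TABLE_BITS 8
static struct toy_sock table[1 << TABLE_BITS];

static unsigned int flow_hash(const struct flow4 *f)
{
	uint32_t h = f->saddr ^ f->daddr ^ (((uint32_t)f->sport << 16) | f->dport);

	h *= 0x9e3779b1u;               /* simple multiplicative mixing */
	return h >> (32 - TABLE_BITS);
}

static struct toy_sock *early_demux(const struct flow4 *f)
{
	struct toy_sock *sk = &table[flow_hash(f)];

	return (sk->in_use && !memcmp(&sk->key, f, sizeof(*f))) ? sk : NULL;
}

int main(void)
{
	struct flow4 f = { 0x0a000001, 0x0a000002, 12345, 80 };

	table[flow_hash(&f)] = (struct toy_sock){ .key = f, .in_use = 1 };
	printf("hit: %s\n", early_demux(&f) ? "yes" : "no");
	return 0;
}
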
@@ -1952,7 +1645,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
1952int tcp_v4_rcv(struct sk_buff *skb) 1645int tcp_v4_rcv(struct sk_buff *skb)
1953{ 1646{
1954 const struct iphdr *iph; 1647 const struct iphdr *iph;
1955 const struct tcphdr *th; 1648 struct tcphdr *th;
1956 struct sock *sk; 1649 struct sock *sk;
1957 int ret; 1650 int ret;
1958 struct net *net = dev_net(skb->dev); 1651 struct net *net = dev_net(skb->dev);
@@ -1987,7 +1680,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
1987 skb->len - th->doff * 4); 1680 skb->len - th->doff * 4);
1988 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1681 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1989 TCP_SKB_CB(skb)->when = 0; 1682 TCP_SKB_CB(skb)->when = 0;
1990 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1683 TCP_SKB_CB(skb)->flags = iph->tos;
1991 TCP_SKB_CB(skb)->sacked = 0; 1684 TCP_SKB_CB(skb)->sacked = 0;
1992 1685
1993 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); 1686 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
@@ -2018,7 +1711,7 @@ process:
2018#ifdef CONFIG_NET_DMA 1711#ifdef CONFIG_NET_DMA
2019 struct tcp_sock *tp = tcp_sk(sk); 1712 struct tcp_sock *tp = tcp_sk(sk);
2020 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1713 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2021 tp->ucopy.dma_chan = net_dma_find_channel(); 1714 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
2022 if (tp->ucopy.dma_chan) 1715 if (tp->ucopy.dma_chan)
2023 ret = tcp_v4_do_rcv(sk, skb); 1716 ret = tcp_v4_do_rcv(sk, skb);
2024 else 1717 else
@@ -2027,8 +1720,7 @@ process:
2027 if (!tcp_prequeue(sk, skb)) 1720 if (!tcp_prequeue(sk, skb))
2028 ret = tcp_v4_do_rcv(sk, skb); 1721 ret = tcp_v4_do_rcv(sk, skb);
2029 } 1722 }
2030 } else if (unlikely(sk_add_backlog(sk, skb, 1723 } else if (unlikely(sk_add_backlog(sk, skb))) {
2031 sk->sk_rcvbuf + sk->sk_sndbuf))) {
2032 bh_unlock_sock(sk); 1724 bh_unlock_sock(sk);
2033 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); 1725 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2034 goto discard_and_relse; 1726 goto discard_and_relse;
@@ -2094,29 +1786,49 @@ do_time_wait:
2094 goto discard_it; 1786 goto discard_it;
2095} 1787}
2096 1788
1789struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1790{
1791 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1792 struct inet_sock *inet = inet_sk(sk);
1793 struct inet_peer *peer;
1794
1795 if (!rt ||
1796 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1797 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1798 *release_it = true;
1799 } else {
1800 if (!rt->peer)
1801 rt_bind_peer(rt, inet->inet_daddr, 1);
1802 peer = rt->peer;
1803 *release_it = false;
1804 }
1805
1806 return peer;
1807}
1808EXPORT_SYMBOL(tcp_v4_get_peer);
1809
1810void *tcp_v4_tw_get_peer(struct sock *sk)
1811{
1812 struct inet_timewait_sock *tw = inet_twsk(sk);
1813
1814 return inet_getpeer_v4(tw->tw_daddr, 1);
1815}
1816EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1817
2097static struct timewait_sock_ops tcp_timewait_sock_ops = { 1818static struct timewait_sock_ops tcp_timewait_sock_ops = {
2098 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1819 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2099 .twsk_unique = tcp_twsk_unique, 1820 .twsk_unique = tcp_twsk_unique,
2100 .twsk_destructor= tcp_twsk_destructor, 1821 .twsk_destructor= tcp_twsk_destructor,
1822 .twsk_getpeer = tcp_v4_tw_get_peer,
2101}; 1823};
2102 1824
2103void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2104{
2105 struct dst_entry *dst = skb_dst(skb);
2106
2107 dst_hold(dst);
2108 sk->sk_rx_dst = dst;
2109 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2110}
2111EXPORT_SYMBOL(inet_sk_rx_dst_set);
2112
2113const struct inet_connection_sock_af_ops ipv4_specific = { 1825const struct inet_connection_sock_af_ops ipv4_specific = {
2114 .queue_xmit = ip_queue_xmit, 1826 .queue_xmit = ip_queue_xmit,
2115 .send_check = tcp_v4_send_check, 1827 .send_check = tcp_v4_send_check,
2116 .rebuild_header = inet_sk_rebuild_header, 1828 .rebuild_header = inet_sk_rebuild_header,
2117 .sk_rx_dst_set = inet_sk_rx_dst_set,
2118 .conn_request = tcp_v4_conn_request, 1829 .conn_request = tcp_v4_conn_request,
2119 .syn_recv_sock = tcp_v4_syn_recv_sock, 1830 .syn_recv_sock = tcp_v4_syn_recv_sock,
1831 .get_peer = tcp_v4_get_peer,
2120 .net_header_len = sizeof(struct iphdr), 1832 .net_header_len = sizeof(struct iphdr),
2121 .setsockopt = ip_setsockopt, 1833 .setsockopt = ip_setsockopt,
2122 .getsockopt = ip_getsockopt, 1834 .getsockopt = ip_getsockopt,
@@ -2134,6 +1846,7 @@ EXPORT_SYMBOL(ipv4_specific);
2134static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1846static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2135 .md5_lookup = tcp_v4_md5_lookup, 1847 .md5_lookup = tcp_v4_md5_lookup,
2136 .calc_md5_hash = tcp_v4_md5_hash_skb, 1848 .calc_md5_hash = tcp_v4_md5_hash_skb,
1849 .md5_add = tcp_v4_md5_add_func,
2137 .md5_parse = tcp_v4_parse_md5_keys, 1850 .md5_parse = tcp_v4_parse_md5_keys,
2138}; 1851};
2139#endif 1852#endif
@@ -2144,15 +1857,63 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2144static int tcp_v4_init_sock(struct sock *sk) 1857static int tcp_v4_init_sock(struct sock *sk)
2145{ 1858{
2146 struct inet_connection_sock *icsk = inet_csk(sk); 1859 struct inet_connection_sock *icsk = inet_csk(sk);
1860 struct tcp_sock *tp = tcp_sk(sk);
2147 1861
2148 tcp_init_sock(sk); 1862 skb_queue_head_init(&tp->out_of_order_queue);
1863 tcp_init_xmit_timers(sk);
1864 tcp_prequeue_init(tp);
2149 1865
2150 icsk->icsk_af_ops = &ipv4_specific; 1866 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867 tp->mdev = TCP_TIMEOUT_INIT;
1868
1869 /* So many TCP implementations out there (incorrectly) count the
1870 * initial SYN frame in their delayed-ACK and congestion control
1871 * algorithms that we must have the following bandaid to talk
1872 * efficiently to them. -DaveM
1873 */
1874 tp->snd_cwnd = TCP_INIT_CWND;
1875
1876 /* See draft-stevens-tcpca-spec-01 for discussion of the
1877 * initialization of these values.
1878 */
1879 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880 tp->snd_cwnd_clamp = ~0;
1881 tp->mss_cache = TCP_MSS_DEFAULT;
1882
1883 tp->reordering = sysctl_tcp_reordering;
1884 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
2151 1885
1886 sk->sk_state = TCP_CLOSE;
1887
1888 sk->sk_write_space = sk_stream_write_space;
1889 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1890
1891 icsk->icsk_af_ops = &ipv4_specific;
1892 icsk->icsk_sync_mss = tcp_sync_mss;
2152#ifdef CONFIG_TCP_MD5SIG 1893#ifdef CONFIG_TCP_MD5SIG
2153 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1894 tp->af_specific = &tcp_sock_ipv4_specific;
2154#endif 1895#endif
2155 1896
1897 /* TCP Cookie Transactions */
1898 if (sysctl_tcp_cookie_size > 0) {
1899 /* Default, cookies without s_data_payload. */
1900 tp->cookie_values =
1901 kzalloc(sizeof(*tp->cookie_values),
1902 sk->sk_allocation);
1903 if (tp->cookie_values != NULL)
1904 kref_init(&tp->cookie_values->kref);
1905 }
1906 /* Presumed zeroed, in order of appearance:
1907 * cookie_in_always, cookie_out_never,
1908 * s_data_constant, s_data_in, s_data_out
1909 */
1910 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1912
1913 local_bh_disable();
1914 percpu_counter_inc(&tcp_sockets_allocated);
1915 local_bh_enable();
1916
2156 return 0; 1917 return 0;
2157} 1918}
2158 1919
@@ -2173,8 +1934,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
2173#ifdef CONFIG_TCP_MD5SIG 1934#ifdef CONFIG_TCP_MD5SIG
2174 /* Clean up the MD5 key list, if any */ 1935 /* Clean up the MD5 key list, if any */
2175 if (tp->md5sig_info) { 1936 if (tp->md5sig_info) {
2176 tcp_clear_md5_list(sk); 1937 tcp_v4_clear_md5_list(sk);
2177 kfree_rcu(tp->md5sig_info, rcu); 1938 kfree(tp->md5sig_info);
2178 tp->md5sig_info = NULL; 1939 tp->md5sig_info = NULL;
2179 } 1940 }
2180#endif 1941#endif
@@ -2191,19 +1952,22 @@ void tcp_v4_destroy_sock(struct sock *sk)
2191 if (inet_csk(sk)->icsk_bind_hash) 1952 if (inet_csk(sk)->icsk_bind_hash)
2192 inet_put_port(sk); 1953 inet_put_port(sk);
2193 1954
1955 /*
1956 * If sendmsg cached page exists, toss it.
1957 */
1958 if (sk->sk_sndmsg_page) {
1959 __free_page(sk->sk_sndmsg_page);
1960 sk->sk_sndmsg_page = NULL;
1961 }
1962
2194 /* TCP Cookie Transactions */ 1963 /* TCP Cookie Transactions */
2195 if (tp->cookie_values != NULL) { 1964 if (tp->cookie_values != NULL) {
2196 kref_put(&tp->cookie_values->kref, 1965 kref_put(&tp->cookie_values->kref,
2197 tcp_cookie_values_release); 1966 tcp_cookie_values_release);
2198 tp->cookie_values = NULL; 1967 tp->cookie_values = NULL;
2199 } 1968 }
2200 BUG_ON(tp->fastopen_rsk != NULL);
2201 1969
2202 /* If socket is aborted during connect operation */ 1970 percpu_counter_dec(&tcp_sockets_allocated);
2203 tcp_free_fastopen_req(tp);
2204
2205 sk_sockets_allocated_dec(sk);
2206 sock_release_memcg(sk);
2207} 1971}
2208EXPORT_SYMBOL(tcp_v4_destroy_sock); 1972EXPORT_SYMBOL(tcp_v4_destroy_sock);
2209 1973
@@ -2325,7 +2089,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2325 return rc; 2089 return rc;
2326} 2090}
2327 2091
2328static inline bool empty_bucket(struct tcp_iter_state *st) 2092static inline int empty_bucket(struct tcp_iter_state *st)
2329{ 2093{
2330 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 2094 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2331 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2095 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
@@ -2572,7 +2336,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2572 } 2336 }
2573} 2337}
2574 2338
2575int tcp_seq_open(struct inode *inode, struct file *file) 2339static int tcp_seq_open(struct inode *inode, struct file *file)
2576{ 2340{
2577 struct tcp_seq_afinfo *afinfo = PDE(inode)->data; 2341 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2578 struct tcp_iter_state *s; 2342 struct tcp_iter_state *s;
@@ -2588,19 +2352,23 @@ int tcp_seq_open(struct inode *inode, struct file *file)
2588 s->last_pos = 0; 2352 s->last_pos = 0;
2589 return 0; 2353 return 0;
2590} 2354}
2591EXPORT_SYMBOL(tcp_seq_open);
2592 2355
2593int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) 2356int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2594{ 2357{
2595 int rc = 0; 2358 int rc = 0;
2596 struct proc_dir_entry *p; 2359 struct proc_dir_entry *p;
2597 2360
2361 afinfo->seq_fops.open = tcp_seq_open;
2362 afinfo->seq_fops.read = seq_read;
2363 afinfo->seq_fops.llseek = seq_lseek;
2364 afinfo->seq_fops.release = seq_release_net;
2365
2598 afinfo->seq_ops.start = tcp_seq_start; 2366 afinfo->seq_ops.start = tcp_seq_start;
2599 afinfo->seq_ops.next = tcp_seq_next; 2367 afinfo->seq_ops.next = tcp_seq_next;
2600 afinfo->seq_ops.stop = tcp_seq_stop; 2368 afinfo->seq_ops.stop = tcp_seq_stop;
2601 2369
2602 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2370 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2603 afinfo->seq_fops, afinfo); 2371 &afinfo->seq_fops, afinfo);
2604 if (!p) 2372 if (!p)
2605 rc = -ENOMEM; 2373 rc = -ENOMEM;
2606 return rc; 2374 return rc;
@@ -2613,11 +2381,11 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2613} 2381}
2614EXPORT_SYMBOL(tcp_proc_unregister); 2382EXPORT_SYMBOL(tcp_proc_unregister);
2615 2383
2616static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2384static void get_openreq4(struct sock *sk, struct request_sock *req,
2617 struct seq_file *f, int i, kuid_t uid, int *len) 2385 struct seq_file *f, int i, int uid, int *len)
2618{ 2386{
2619 const struct inet_request_sock *ireq = inet_rsk(req); 2387 const struct inet_request_sock *ireq = inet_rsk(req);
2620 long delta = req->expires - jiffies; 2388 int ttd = req->expires - jiffies;
2621 2389
2622 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2390 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2623 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2391 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2629,9 +2397,9 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2629 TCP_SYN_RECV, 2397 TCP_SYN_RECV,
2630 0, 0, /* could print option size, but that is af dependent. */ 2398 0, 0, /* could print option size, but that is af dependent. */
2631 1, /* timers active (only the expire timer) */ 2399 1, /* timers active (only the expire timer) */
2632 jiffies_delta_to_clock_t(delta), 2400 jiffies_to_clock_t(ttd),
2633 req->num_timeout, 2401 req->retrans,
2634 from_kuid_munged(seq_user_ns(f), uid), 2402 uid,
2635 0, /* non standard timer */ 2403 0, /* non standard timer */
2636 0, /* open_requests have no inode */ 2404 0, /* open_requests have no inode */
2637 atomic_read(&sk->sk_refcnt), 2405 atomic_read(&sk->sk_refcnt),
@@ -2643,10 +2411,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2643{ 2411{
2644 int timer_active; 2412 int timer_active;
2645 unsigned long timer_expires; 2413 unsigned long timer_expires;
2646 const struct tcp_sock *tp = tcp_sk(sk); 2414 struct tcp_sock *tp = tcp_sk(sk);
2647 const struct inet_connection_sock *icsk = inet_csk(sk); 2415 const struct inet_connection_sock *icsk = inet_csk(sk);
2648 const struct inet_sock *inet = inet_sk(sk); 2416 struct inet_sock *inet = inet_sk(sk);
2649 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2650 __be32 dest = inet->inet_daddr; 2417 __be32 dest = inet->inet_daddr;
2651 __be32 src = inet->inet_rcv_saddr; 2418 __be32 src = inet->inet_rcv_saddr;
2652 __u16 destp = ntohs(inet->inet_dport); 2419 __u16 destp = ntohs(inet->inet_dport);
@@ -2681,9 +2448,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2681 tp->write_seq - tp->snd_una, 2448 tp->write_seq - tp->snd_una,
2682 rx_queue, 2449 rx_queue,
2683 timer_active, 2450 timer_active,
2684 jiffies_delta_to_clock_t(timer_expires - jiffies), 2451 jiffies_to_clock_t(timer_expires - jiffies),
2685 icsk->icsk_retransmits, 2452 icsk->icsk_retransmits,
2686 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2453 sock_i_uid(sk),
2687 icsk->icsk_probes_out, 2454 icsk->icsk_probes_out,
2688 sock_i_ino(sk), 2455 sock_i_ino(sk),
2689 atomic_read(&sk->sk_refcnt), sk, 2456 atomic_read(&sk->sk_refcnt), sk,
@@ -2691,18 +2458,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2691 jiffies_to_clock_t(icsk->icsk_ack.ato), 2458 jiffies_to_clock_t(icsk->icsk_ack.ato),
2692 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2459 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2693 tp->snd_cwnd, 2460 tp->snd_cwnd,
2694 sk->sk_state == TCP_LISTEN ? 2461 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2695 (fastopenq ? fastopenq->max_qlen : 0) :
2696 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2697 len); 2462 len);
2698} 2463}
2699 2464
2700static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2465static void get_timewait4_sock(struct inet_timewait_sock *tw,
2701 struct seq_file *f, int i, int *len) 2466 struct seq_file *f, int i, int *len)
2702{ 2467{
2703 __be32 dest, src; 2468 __be32 dest, src;
2704 __u16 destp, srcp; 2469 __u16 destp, srcp;
2705 long delta = tw->tw_ttd - jiffies; 2470 int ttd = tw->tw_ttd - jiffies;
2471
2472 if (ttd < 0)
2473 ttd = 0;
2706 2474
2707 dest = tw->tw_daddr; 2475 dest = tw->tw_daddr;
2708 src = tw->tw_rcv_saddr; 2476 src = tw->tw_rcv_saddr;
@@ -2712,7 +2480,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2712 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2480 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2713 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2481 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2714 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2482 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2715 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2483 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2716 atomic_read(&tw->tw_refcnt), tw, len); 2484 atomic_read(&tw->tw_refcnt), tw, len);
2717} 2485}
2718 2486
@@ -2749,18 +2517,12 @@ out:
2749 return 0; 2517 return 0;
2750} 2518}
2751 2519
2752static const struct file_operations tcp_afinfo_seq_fops = {
2753 .owner = THIS_MODULE,
2754 .open = tcp_seq_open,
2755 .read = seq_read,
2756 .llseek = seq_lseek,
2757 .release = seq_release_net
2758};
2759
2760static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2520static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2761 .name = "tcp", 2521 .name = "tcp",
2762 .family = AF_INET, 2522 .family = AF_INET,
2763 .seq_fops = &tcp_afinfo_seq_fops, 2523 .seq_fops = {
2524 .owner = THIS_MODULE,
2525 },
2764 .seq_ops = { 2526 .seq_ops = {
2765 .show = tcp4_seq_show, 2527 .show = tcp4_seq_show,
2766 }, 2528 },
@@ -2795,8 +2557,6 @@ void tcp4_proc_exit(void)
2795struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2557struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2796{ 2558{
2797 const struct iphdr *iph = skb_gro_network_header(skb); 2559 const struct iphdr *iph = skb_gro_network_header(skb);
2798 __wsum wsum;
2799 __sum16 sum;
2800 2560
2801 switch (skb->ip_summed) { 2561 switch (skb->ip_summed) {
2802 case CHECKSUM_COMPLETE: 2562 case CHECKSUM_COMPLETE:
@@ -2805,22 +2565,11 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2805 skb->ip_summed = CHECKSUM_UNNECESSARY; 2565 skb->ip_summed = CHECKSUM_UNNECESSARY;
2806 break; 2566 break;
2807 } 2567 }
2808flush:
2809 NAPI_GRO_CB(skb)->flush = 1;
2810 return NULL;
2811 2568
2569 /* fall through */
2812 case CHECKSUM_NONE: 2570 case CHECKSUM_NONE:
2813 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, 2571 NAPI_GRO_CB(skb)->flush = 1;
2814 skb_gro_len(skb), IPPROTO_TCP, 0); 2572 return NULL;
2815 sum = csum_fold(skb_checksum(skb,
2816 skb_gro_offset(skb),
2817 skb_gro_len(skb),
2818 wsum));
2819 if (sum)
2820 goto flush;
2821
2822 skb->ip_summed = CHECKSUM_UNNECESSARY;
2823 break;
2824 } 2573 }
2825 2574
2826 return tcp_gro_receive(head, skb); 2575 return tcp_gro_receive(head, skb);
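
The removed CHECKSUM_NONE branch above verified the checksum in software: it built the pseudo-header sum with csum_tcpudp_nofold(), added the payload via skb_checksum(), folded the 32-bit total to 16 bits and flushed GRO if the fold was non-zero. The following is a standalone sketch of that final fold only, not the kernel's csum_fold() itself.

/* 16-bit one's-complement fold of a 32-bit running checksum. */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);   /* fold the carries once... */
	sum = (sum & 0xffff) + (sum >> 16);   /* ...and once more for the last carry */
	return (uint16_t)~sum;                /* one's-complement result */
}

int main(void)
{
	/* A sum that folds to 0 means "checksum verified" in that path. */
	printf("0x%04x\n", csum_fold32(0x0001fffe)); /* -> 0x0000 */
	printf("0x%04x\n", csum_fold32(0x12345678)); /* nonzero -> would flush GRO */
	return 0;
}
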
@@ -2855,8 +2604,6 @@ struct proto tcp_prot = {
2855 .sendmsg = tcp_sendmsg, 2604 .sendmsg = tcp_sendmsg,
2856 .sendpage = tcp_sendpage, 2605 .sendpage = tcp_sendpage,
2857 .backlog_rcv = tcp_v4_do_rcv, 2606 .backlog_rcv = tcp_v4_do_rcv,
2858 .release_cb = tcp_release_cb,
2859 .mtu_reduced = tcp_v4_mtu_reduced,
2860 .hash = inet_hash, 2607 .hash = inet_hash,
2861 .unhash = inet_unhash, 2608 .unhash = inet_unhash,
2862 .get_port = inet_csk_get_port, 2609 .get_port = inet_csk_get_port,
@@ -2865,6 +2612,7 @@ struct proto tcp_prot = {
2865 .orphan_count = &tcp_orphan_count, 2612 .orphan_count = &tcp_orphan_count,
2866 .memory_allocated = &tcp_memory_allocated, 2613 .memory_allocated = &tcp_memory_allocated,
2867 .memory_pressure = &tcp_memory_pressure, 2614 .memory_pressure = &tcp_memory_pressure,
2615 .sysctl_mem = sysctl_tcp_mem,
2868 .sysctl_wmem = sysctl_tcp_wmem, 2616 .sysctl_wmem = sysctl_tcp_wmem,
2869 .sysctl_rmem = sysctl_tcp_rmem, 2617 .sysctl_rmem = sysctl_tcp_rmem,
2870 .max_header = MAX_TCP_HEADER, 2618 .max_header = MAX_TCP_HEADER,
@@ -2878,21 +2626,19 @@ struct proto tcp_prot = {
2878 .compat_setsockopt = compat_tcp_setsockopt, 2626 .compat_setsockopt = compat_tcp_setsockopt,
2879 .compat_getsockopt = compat_tcp_getsockopt, 2627 .compat_getsockopt = compat_tcp_getsockopt,
2880#endif 2628#endif
2881#ifdef CONFIG_MEMCG_KMEM
2882 .init_cgroup = tcp_init_cgroup,
2883 .destroy_cgroup = tcp_destroy_cgroup,
2884 .proto_cgroup = tcp_proto_cgroup,
2885#endif
2886}; 2629};
2887EXPORT_SYMBOL(tcp_prot); 2630EXPORT_SYMBOL(tcp_prot);
2888 2631
2632
2889static int __net_init tcp_sk_init(struct net *net) 2633static int __net_init tcp_sk_init(struct net *net)
2890{ 2634{
2891 return 0; 2635 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2636 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2892} 2637}
2893 2638
2894static void __net_exit tcp_sk_exit(struct net *net) 2639static void __net_exit tcp_sk_exit(struct net *net)
2895{ 2640{
2641 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2896} 2642}
2897 2643
2898static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2644static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index b6f3583ddfe..00000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,291 +0,0 @@
1#include <net/tcp.h>
2#include <net/tcp_memcontrol.h>
3#include <net/sock.h>
4#include <net/ip.h>
5#include <linux/nsproxy.h>
6#include <linux/memcontrol.h>
7#include <linux/module.h>
8
9static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
10{
11 return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
12}
13
14static void memcg_tcp_enter_memory_pressure(struct sock *sk)
15{
16 if (sk->sk_cgrp->memory_pressure)
17 *sk->sk_cgrp->memory_pressure = 1;
18}
19EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
20
21int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
22{
23 /*
24 * The root cgroup does not use res_counters, but rather,
25 * rely on the data already collected by the network
26 * subsystem
27 */
28 struct res_counter *res_parent = NULL;
29 struct cg_proto *cg_proto, *parent_cg;
30 struct tcp_memcontrol *tcp;
31 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
32 struct net *net = current->nsproxy->net_ns;
33
34 cg_proto = tcp_prot.proto_cgroup(memcg);
35 if (!cg_proto)
36 return 0;
37
38 tcp = tcp_from_cgproto(cg_proto);
39
40 tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
41 tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
42 tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
43 tcp->tcp_memory_pressure = 0;
44
45 parent_cg = tcp_prot.proto_cgroup(parent);
46 if (parent_cg)
47 res_parent = parent_cg->memory_allocated;
48
49 res_counter_init(&tcp->tcp_memory_allocated, res_parent);
50 percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
51
52 cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
53 cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
54 cg_proto->sysctl_mem = tcp->tcp_prot_mem;
55 cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
56 cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
57 cg_proto->memcg = memcg;
58
59 return 0;
60}
61EXPORT_SYMBOL(tcp_init_cgroup);
62
63void tcp_destroy_cgroup(struct mem_cgroup *memcg)
64{
65 struct cg_proto *cg_proto;
66 struct tcp_memcontrol *tcp;
67 u64 val;
68
69 cg_proto = tcp_prot.proto_cgroup(memcg);
70 if (!cg_proto)
71 return;
72
73 tcp = tcp_from_cgproto(cg_proto);
74 percpu_counter_destroy(&tcp->tcp_sockets_allocated);
75
76 val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
77}
78EXPORT_SYMBOL(tcp_destroy_cgroup);
79
80static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
81{
82 struct net *net = current->nsproxy->net_ns;
83 struct tcp_memcontrol *tcp;
84 struct cg_proto *cg_proto;
85 u64 old_lim;
86 int i;
87 int ret;
88
89 cg_proto = tcp_prot.proto_cgroup(memcg);
90 if (!cg_proto)
91 return -EINVAL;
92
93 if (val > RESOURCE_MAX)
94 val = RESOURCE_MAX;
95
96 tcp = tcp_from_cgproto(cg_proto);
97
98 old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
99 ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
100 if (ret)
101 return ret;
102
103 for (i = 0; i < 3; i++)
104 tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
105 net->ipv4.sysctl_tcp_mem[i]);
106
107 if (val == RESOURCE_MAX)
108 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
109 else if (val != RESOURCE_MAX) {
110 /*
111 * The active bit needs to be written after the static_key
112 * update. This is what guarantees that the socket activation
113 * function is the last one to run. See sock_update_memcg() for
114 * details, and note that we don't mark any socket as belonging
115 * to this memcg until that flag is up.
116 *
117 * We need to do this, because static_keys will span multiple
118 * sites, but we can't control their order. If we mark a socket
119 * as accounted, but the accounting functions are not patched in
120 * yet, we'll lose accounting.
121 *
122 * We never race with the readers in sock_update_memcg(),
123 * because when this value change, the code to process it is not
124 * patched in yet.
125 *
126 * The activated bit is used to guarantee that no two writers
127 * will do the update in the same memcg. Without that, we can't
128 * properly shutdown the static key.
129 */
130 if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
131 static_key_slow_inc(&memcg_socket_limit_enabled);
132 set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
133 }
134
135 return 0;
136}
137
138static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
139 const char *buffer)
140{
141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
142 unsigned long long val;
143 int ret = 0;
144
145 switch (cft->private) {
146 case RES_LIMIT:
147 /* see memcontrol.c */
148 ret = res_counter_memparse_write_strategy(buffer, &val);
149 if (ret)
150 break;
151 ret = tcp_update_limit(memcg, val);
152 break;
153 default:
154 ret = -EINVAL;
155 break;
156 }
157 return ret;
158}
159
160static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
161{
162 struct tcp_memcontrol *tcp;
163 struct cg_proto *cg_proto;
164
165 cg_proto = tcp_prot.proto_cgroup(memcg);
166 if (!cg_proto)
167 return default_val;
168
169 tcp = tcp_from_cgproto(cg_proto);
170 return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
171}
172
173static u64 tcp_read_usage(struct mem_cgroup *memcg)
174{
175 struct tcp_memcontrol *tcp;
176 struct cg_proto *cg_proto;
177
178 cg_proto = tcp_prot.proto_cgroup(memcg);
179 if (!cg_proto)
180 return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
181
182 tcp = tcp_from_cgproto(cg_proto);
183 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
184}
185
186static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
187{
188 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
189 u64 val;
190
191 switch (cft->private) {
192 case RES_LIMIT:
193 val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
194 break;
195 case RES_USAGE:
196 val = tcp_read_usage(memcg);
197 break;
198 case RES_FAILCNT:
199 case RES_MAX_USAGE:
200 val = tcp_read_stat(memcg, cft->private, 0);
201 break;
202 default:
203 BUG();
204 }
205 return val;
206}
207
208static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event)
209{
210 struct mem_cgroup *memcg;
211 struct tcp_memcontrol *tcp;
212 struct cg_proto *cg_proto;
213
214 memcg = mem_cgroup_from_cont(cont);
215 cg_proto = tcp_prot.proto_cgroup(memcg);
216 if (!cg_proto)
217 return 0;
218 tcp = tcp_from_cgproto(cg_proto);
219
220 switch (event) {
221 case RES_MAX_USAGE:
222 res_counter_reset_max(&tcp->tcp_memory_allocated);
223 break;
224 case RES_FAILCNT:
225 res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
226 break;
227 }
228
229 return 0;
230}
231
232unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
233{
234 struct tcp_memcontrol *tcp;
235 struct cg_proto *cg_proto;
236
237 cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
238 if (!cg_proto)
239 return 0;
240
241 tcp = tcp_from_cgproto(cg_proto);
242 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
243}
244
245void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
246{
247 struct tcp_memcontrol *tcp;
248 struct cg_proto *cg_proto;
249
250 cg_proto = tcp_prot.proto_cgroup(memcg);
251 if (!cg_proto)
252 return;
253
254 tcp = tcp_from_cgproto(cg_proto);
255
256 tcp->tcp_prot_mem[idx] = val;
257}
258
259static struct cftype tcp_files[] = {
260 {
261 .name = "kmem.tcp.limit_in_bytes",
262 .write_string = tcp_cgroup_write,
263 .read_u64 = tcp_cgroup_read,
264 .private = RES_LIMIT,
265 },
266 {
267 .name = "kmem.tcp.usage_in_bytes",
268 .read_u64 = tcp_cgroup_read,
269 .private = RES_USAGE,
270 },
271 {
272 .name = "kmem.tcp.failcnt",
273 .private = RES_FAILCNT,
274 .trigger = tcp_cgroup_reset,
275 .read_u64 = tcp_cgroup_read,
276 },
277 {
278 .name = "kmem.tcp.max_usage_in_bytes",
279 .private = RES_MAX_USAGE,
280 .trigger = tcp_cgroup_reset,
281 .read_u64 = tcp_cgroup_read,
282 },
283 { } /* terminate */
284};
285
286static int __init tcp_memcontrol_init(void)
287{
288 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
289 return 0;
290}
291__initcall(tcp_memcontrol_init);
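
In the deleted tcp_memcontrol.c above, tcp_update_limit() turns the cgroup's byte limit into the three per-cgroup pressure thresholds: pages = limit >> PAGE_SHIFT, each clamped by the corresponding global sysctl_tcp_mem[] entry. A minimal sketch of that arithmetic, with PAGE_SHIFT assumed to be 12 and example global values:

/* Sketch of the byte-limit to page-threshold conversion used above. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	long sysctl_tcp_mem[3] = { 180000, 240000, 360000 }; /* example globals, pages */
	unsigned long long limit_bytes = 256ULL << 20;       /* cgroup limit: 256 MiB */
	long prot_mem[3];

	for (int i = 0; i < 3; i++) {
		long pages = (long)(limit_bytes >> PAGE_SHIFT);

		prot_mem[i] = pages < sysctl_tcp_mem[i] ? pages : sysctl_tcp_mem[i];
		printf("prot_mem[%d] = %ld pages\n", i, prot_mem[i]);
	}
	return 0;
}
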
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
deleted file mode 100644
index f696d7c2e9f..00000000000
--- a/net/ipv4/tcp_metrics.c
+++ /dev/null
@@ -1,1091 +0,0 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/module.h>
5#include <linux/cache.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/tcp.h>
9#include <linux/hash.h>
10#include <linux/tcp_metrics.h>
11#include <linux/vmalloc.h>
12
13#include <net/inet_connection_sock.h>
14#include <net/net_namespace.h>
15#include <net/request_sock.h>
16#include <net/inetpeer.h>
17#include <net/sock.h>
18#include <net/ipv6.h>
19#include <net/dst.h>
20#include <net/tcp.h>
21#include <net/genetlink.h>
22
23int sysctl_tcp_nometrics_save __read_mostly;
24
25struct tcp_fastopen_metrics {
26 u16 mss;
27 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
28 unsigned long last_syn_loss; /* Last Fast Open SYN loss */
29 struct tcp_fastopen_cookie cookie;
30};
31
32struct tcp_metrics_block {
33 struct tcp_metrics_block __rcu *tcpm_next;
34 struct inetpeer_addr tcpm_addr;
35 unsigned long tcpm_stamp;
36 u32 tcpm_ts;
37 u32 tcpm_ts_stamp;
38 u32 tcpm_lock;
39 u32 tcpm_vals[TCP_METRIC_MAX + 1];
40 struct tcp_fastopen_metrics tcpm_fastopen;
41
42 struct rcu_head rcu_head;
43};
44
45static bool tcp_metric_locked(struct tcp_metrics_block *tm,
46 enum tcp_metric_index idx)
47{
48 return tm->tcpm_lock & (1 << idx);
49}
50
51static u32 tcp_metric_get(struct tcp_metrics_block *tm,
52 enum tcp_metric_index idx)
53{
54 return tm->tcpm_vals[idx];
55}
56
57static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
58 enum tcp_metric_index idx)
59{
60 return msecs_to_jiffies(tm->tcpm_vals[idx]);
61}
62
63static void tcp_metric_set(struct tcp_metrics_block *tm,
64 enum tcp_metric_index idx,
65 u32 val)
66{
67 tm->tcpm_vals[idx] = val;
68}
69
70static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
71 enum tcp_metric_index idx,
72 u32 val)
73{
74 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
75}
76
77static bool addr_same(const struct inetpeer_addr *a,
78 const struct inetpeer_addr *b)
79{
80 const struct in6_addr *a6, *b6;
81
82 if (a->family != b->family)
83 return false;
84 if (a->family == AF_INET)
85 return a->addr.a4 == b->addr.a4;
86
87 a6 = (const struct in6_addr *) &a->addr.a6[0];
88 b6 = (const struct in6_addr *) &b->addr.a6[0];
89
90 return ipv6_addr_equal(a6, b6);
91}
92
93struct tcpm_hash_bucket {
94 struct tcp_metrics_block __rcu *chain;
95};
96
97static DEFINE_SPINLOCK(tcp_metrics_lock);
98
99static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
100{
101 u32 val;
102
103 tm->tcpm_stamp = jiffies;
104
105 val = 0;
106 if (dst_metric_locked(dst, RTAX_RTT))
107 val |= 1 << TCP_METRIC_RTT;
108 if (dst_metric_locked(dst, RTAX_RTTVAR))
109 val |= 1 << TCP_METRIC_RTTVAR;
110 if (dst_metric_locked(dst, RTAX_SSTHRESH))
111 val |= 1 << TCP_METRIC_SSTHRESH;
112 if (dst_metric_locked(dst, RTAX_CWND))
113 val |= 1 << TCP_METRIC_CWND;
114 if (dst_metric_locked(dst, RTAX_REORDERING))
115 val |= 1 << TCP_METRIC_REORDERING;
116 tm->tcpm_lock = val;
117
118 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
119 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
120 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
121 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
122 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
123 tm->tcpm_ts = 0;
124 tm->tcpm_ts_stamp = 0;
125 tm->tcpm_fastopen.mss = 0;
126 tm->tcpm_fastopen.syn_loss = 0;
127 tm->tcpm_fastopen.cookie.len = 0;
128}
129
130static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
131 struct inetpeer_addr *addr,
132 unsigned int hash,
133 bool reclaim)
134{
135 struct tcp_metrics_block *tm;
136 struct net *net;
137
138 spin_lock_bh(&tcp_metrics_lock);
139 net = dev_net(dst->dev);
140 if (unlikely(reclaim)) {
141 struct tcp_metrics_block *oldest;
142
143 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
144 for (tm = rcu_dereference(oldest->tcpm_next); tm;
145 tm = rcu_dereference(tm->tcpm_next)) {
146 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
147 oldest = tm;
148 }
149 tm = oldest;
150 } else {
151 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
152 if (!tm)
153 goto out_unlock;
154 }
155 tm->tcpm_addr = *addr;
156
157 tcpm_suck_dst(tm, dst);
158
159 if (likely(!reclaim)) {
160 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
161 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
162 }
163
164out_unlock:
165 spin_unlock_bh(&tcp_metrics_lock);
166 return tm;
167}
168
169#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
170
171static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
172{
173 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
174 tcpm_suck_dst(tm, dst);
175}
176
177#define TCP_METRICS_RECLAIM_DEPTH 5
178#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
179
180static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
181{
182 if (tm)
183 return tm;
184 if (depth > TCP_METRICS_RECLAIM_DEPTH)
185 return TCP_METRICS_RECLAIM_PTR;
186 return NULL;
187}
188
189static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
190 struct net *net, unsigned int hash)
191{
192 struct tcp_metrics_block *tm;
193 int depth = 0;
194
195 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
196 tm = rcu_dereference(tm->tcpm_next)) {
197 if (addr_same(&tm->tcpm_addr, addr))
198 break;
199 depth++;
200 }
201 return tcp_get_encode(tm, depth);
202}
203
204static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
205 struct dst_entry *dst)
206{
207 struct tcp_metrics_block *tm;
208 struct inetpeer_addr addr;
209 unsigned int hash;
210 struct net *net;
211
212 addr.family = req->rsk_ops->family;
213 switch (addr.family) {
214 case AF_INET:
215 addr.addr.a4 = inet_rsk(req)->rmt_addr;
216 hash = (__force unsigned int) addr.addr.a4;
217 break;
218 case AF_INET6:
219 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
220 hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
221 break;
222 default:
223 return NULL;
224 }
225
226 net = dev_net(dst->dev);
227 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
228
229 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
230 tm = rcu_dereference(tm->tcpm_next)) {
231 if (addr_same(&tm->tcpm_addr, &addr))
232 break;
233 }
234 tcpm_check_stamp(tm, dst);
235 return tm;
236}
237
238static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
239{
240 struct inet6_timewait_sock *tw6;
241 struct tcp_metrics_block *tm;
242 struct inetpeer_addr addr;
243 unsigned int hash;
244 struct net *net;
245
246 addr.family = tw->tw_family;
247 switch (addr.family) {
248 case AF_INET:
249 addr.addr.a4 = tw->tw_daddr;
250 hash = (__force unsigned int) addr.addr.a4;
251 break;
252 case AF_INET6:
253 tw6 = inet6_twsk((struct sock *)tw);
254 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
255 hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
256 break;
257 default:
258 return NULL;
259 }
260
261 net = twsk_net(tw);
262 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
263
264 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
265 tm = rcu_dereference(tm->tcpm_next)) {
266 if (addr_same(&tm->tcpm_addr, &addr))
267 break;
268 }
269 return tm;
270}
271
272static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
273 struct dst_entry *dst,
274 bool create)
275{
276 struct tcp_metrics_block *tm;
277 struct inetpeer_addr addr;
278 unsigned int hash;
279 struct net *net;
280 bool reclaim;
281
282 addr.family = sk->sk_family;
283 switch (addr.family) {
284 case AF_INET:
285 addr.addr.a4 = inet_sk(sk)->inet_daddr;
286 hash = (__force unsigned int) addr.addr.a4;
287 break;
288 case AF_INET6:
289 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
290 hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
291 break;
292 default:
293 return NULL;
294 }
295
296 net = dev_net(dst->dev);
297 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
298
299 tm = __tcp_get_metrics(&addr, net, hash);
300 reclaim = false;
301 if (tm == TCP_METRICS_RECLAIM_PTR) {
302 reclaim = true;
303 tm = NULL;
304 }
305 if (!tm && create)
306 tm = tcpm_new(dst, &addr, hash, reclaim);
307 else
308 tcpm_check_stamp(tm, dst);
309
310 return tm;
311}
312
313/* Save metrics learned by this TCP session. This function is called
314 * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
315 * or goes from LAST-ACK to CLOSE.
316 */
317void tcp_update_metrics(struct sock *sk)
318{
319 const struct inet_connection_sock *icsk = inet_csk(sk);
320 struct dst_entry *dst = __sk_dst_get(sk);
321 struct tcp_sock *tp = tcp_sk(sk);
322 struct tcp_metrics_block *tm;
323 unsigned long rtt;
324 u32 val;
325 int m;
326
327 if (sysctl_tcp_nometrics_save || !dst)
328 return;
329
330 if (dst->flags & DST_HOST)
331 dst_confirm(dst);
332
333 rcu_read_lock();
334 if (icsk->icsk_backoff || !tp->srtt) {
335 /* This session failed to estimate rtt. Why?
336 * Probably, no packets returned in time. Reset our
337 * results.
338 */
339 tm = tcp_get_metrics(sk, dst, false);
340 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
341 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
342 goto out_unlock;
343 } else
344 tm = tcp_get_metrics(sk, dst, true);
345
346 if (!tm)
347 goto out_unlock;
348
349 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
350 m = rtt - tp->srtt;
351
352 /* If newly calculated rtt larger than stored one, store new
353 * one. Otherwise, use EWMA. Remember, rtt overestimation is
354 * always better than underestimation.
355 */
356 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
357 if (m <= 0)
358 rtt = tp->srtt;
359 else
360 rtt -= (m >> 3);
361 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
362 }
363
364 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
365 unsigned long var;
366
367 if (m < 0)
368 m = -m;
369
370 /* Scale deviation to rttvar fixed point */
371 m >>= 1;
372 if (m < tp->mdev)
373 m = tp->mdev;
374
375 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
376 if (m >= var)
377 var = m;
378 else
379 var -= (var - m) >> 2;
380
381 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
382 }
383
384 if (tcp_in_initial_slowstart(tp)) {
385 /* Slow start still did not finish. */
386 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
387 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
388 if (val && (tp->snd_cwnd >> 1) > val)
389 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
390 tp->snd_cwnd >> 1);
391 }
392 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
393 val = tcp_metric_get(tm, TCP_METRIC_CWND);
394 if (tp->snd_cwnd > val)
395 tcp_metric_set(tm, TCP_METRIC_CWND,
396 tp->snd_cwnd);
397 }
398 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
399 icsk->icsk_ca_state == TCP_CA_Open) {
400 /* Cong. avoidance phase, cwnd is reliable. */
401 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
402 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
403 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
404 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
405 val = tcp_metric_get(tm, TCP_METRIC_CWND);
406 tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
407 }
408 } else {
409 /* Else slow start did not finish, cwnd is non-sense,
410 * ssthresh may be also invalid.
411 */
412 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
413 val = tcp_metric_get(tm, TCP_METRIC_CWND);
414 tcp_metric_set(tm, TCP_METRIC_CWND,
415 (val + tp->snd_ssthresh) >> 1);
416 }
417 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
418 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
419 if (val && tp->snd_ssthresh > val)
420 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
421 tp->snd_ssthresh);
422 }
423 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
424 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
425 if (val < tp->reordering &&
426 tp->reordering != sysctl_tcp_reordering)
427 tcp_metric_set(tm, TCP_METRIC_REORDERING,
428 tp->reordering);
429 }
430 }
431 tm->tcpm_stamp = jiffies;
432out_unlock:
433 rcu_read_unlock();
434}
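
tcp_update_metrics() above smooths the cached RTT asymmetrically: the stored value only decays toward a smaller sample by 1/8 of the difference, but jumps straight up to a larger one, since overestimating RTT is safer than underestimating it. A standalone sketch of just that update (values treated as milliseconds):

/* EWMA-style cached-RTT update mirroring the logic above. */
#include <stdio.h>

static long update_cached_rtt(long cached_rtt, long srtt)
{
	long m = cached_rtt - srtt;

	if (m <= 0)
		return srtt;                  /* new sample is larger: take it as-is */
	return cached_rtt - (m >> 3);         /* otherwise decay toward it by 1/8 */
}

int main(void)
{
	long cached = 200;

	cached = update_cached_rtt(cached, 120); /* 200 - (80 >> 3) = 190 */
	printf("%ld\n", cached);
	cached = update_cached_rtt(cached, 400); /* larger sample: jump to 400 */
	printf("%ld\n", cached);
	return 0;
}
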
435
436/* Initialize metrics on socket. */
437
438void tcp_init_metrics(struct sock *sk)
439{
440 struct dst_entry *dst = __sk_dst_get(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 struct tcp_metrics_block *tm;
443 u32 val;
444
445 if (dst == NULL)
446 goto reset;
447
448 dst_confirm(dst);
449
450 rcu_read_lock();
451 tm = tcp_get_metrics(sk, dst, true);
452 if (!tm) {
453 rcu_read_unlock();
454 goto reset;
455 }
456
457 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
458 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
459
460 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
461 if (val) {
462 tp->snd_ssthresh = val;
463 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
464 tp->snd_ssthresh = tp->snd_cwnd_clamp;
465 } else {
466		/* ssthresh may have been reduced unnecessarily during
467 * 3WHS. Restore it back to its initial default.
468 */
469 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
470 }
471 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
472 if (val && tp->reordering != val) {
473 tcp_disable_fack(tp);
474 tcp_disable_early_retrans(tp);
475 tp->reordering = val;
476 }
477
478 val = tcp_metric_get(tm, TCP_METRIC_RTT);
479 if (val == 0 || tp->srtt == 0) {
480 rcu_read_unlock();
481 goto reset;
482 }
483 /* Initial rtt is determined from SYN,SYN-ACK.
484 * The segment is small and rtt may appear much
485 * less than real one. Use per-dst memory
486 * to make it more realistic.
487 *
488 * A bit of theory. RTT is time passed after "normal" sized packet
489 * is sent until it is ACKed. In normal circumstances sending small
490 * packets force peer to delay ACKs and calculation is correct too.
491 * The algorithm is adaptive and, provided we follow specs, it
492 * NEVER underestimate RTT. BUT! If peer tries to make some clever
493 * tricks sort of "quick acks" for time long enough to decrease RTT
494 * to low value, and then abruptly stops to do it and starts to delay
495 * ACKs, wait for troubles.
496 */
497 val = msecs_to_jiffies(val);
498 if (val > tp->srtt) {
499 tp->srtt = val;
500 tp->rtt_seq = tp->snd_nxt;
501 }
502 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
503 if (val > tp->mdev) {
504 tp->mdev = val;
505 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
506 }
507 rcu_read_unlock();
508
509 tcp_set_rto(sk);
510reset:
511 if (tp->srtt == 0) {
512 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
513 * 3WHS. This is most likely due to retransmission,
514 * including spurious one. Reset the RTO back to 3secs
515 * from the more aggressive 1sec to avoid more spurious
516 * retransmission.
517 */
518 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
519 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
520 }
521 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
522 * retransmitted. In light of RFC6298 more aggressive 1sec
523 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
524 * retransmission has occurred.
525 */
526 if (tp->total_retrans > 1)
527 tp->snd_cwnd = 1;
528 else
529 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
530 tp->snd_cwnd_stamp = tcp_time_stamp;
531}
532
533bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
534{
535 struct tcp_metrics_block *tm;
536 bool ret;
537
538 if (!dst)
539 return false;
540
541 rcu_read_lock();
542 tm = __tcp_get_metrics_req(req, dst);
543 if (paws_check) {
544 if (tm &&
545 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
546 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
547 ret = false;
548 else
549 ret = true;
550 } else {
551 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
552 ret = true;
553 else
554 ret = false;
555 }
556 rcu_read_unlock();
557
558 return ret;
559}
560EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
561
562void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
563{
564 struct tcp_metrics_block *tm;
565
566 rcu_read_lock();
567 tm = tcp_get_metrics(sk, dst, true);
568 if (tm) {
569 struct tcp_sock *tp = tcp_sk(sk);
570
571 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
572 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
573 tp->rx_opt.ts_recent = tm->tcpm_ts;
574 }
575 }
576 rcu_read_unlock();
577}
578EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
579
580/* VJ's idea. Save last timestamp seen from this destination and hold
581 * it at least for normal timewait interval to use for duplicate
582 * segment detection in subsequent connections, before they enter
583 * synchronized state.
584 */
585bool tcp_remember_stamp(struct sock *sk)
586{
587 struct dst_entry *dst = __sk_dst_get(sk);
588 bool ret = false;
589
590 if (dst) {
591 struct tcp_metrics_block *tm;
592
593 rcu_read_lock();
594 tm = tcp_get_metrics(sk, dst, true);
595 if (tm) {
596 struct tcp_sock *tp = tcp_sk(sk);
597
598 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
599 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
600 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
601 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
602 tm->tcpm_ts = tp->rx_opt.ts_recent;
603 }
604 ret = true;
605 }
606 rcu_read_unlock();
607 }
608 return ret;
609}
610
611bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
612{
613 struct tcp_metrics_block *tm;
614 bool ret = false;
615
616 rcu_read_lock();
617 tm = __tcp_get_metrics_tw(tw);
618 if (tm) {
619 const struct tcp_timewait_sock *tcptw;
620 struct sock *sk = (struct sock *) tw;
621
622 tcptw = tcp_twsk(sk);
623 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
624 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
625 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
626 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
627 tm->tcpm_ts = tcptw->tw_ts_recent;
628 }
629 ret = true;
630 }
631 rcu_read_unlock();
632
633 return ret;
634}
635
636static DEFINE_SEQLOCK(fastopen_seqlock);
637
638void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
639 struct tcp_fastopen_cookie *cookie,
640 int *syn_loss, unsigned long *last_syn_loss)
641{
642 struct tcp_metrics_block *tm;
643
644 rcu_read_lock();
645 tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
646 if (tm) {
647 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
648 unsigned int seq;
649
650 do {
651 seq = read_seqbegin(&fastopen_seqlock);
652 if (tfom->mss)
653 *mss = tfom->mss;
654 *cookie = tfom->cookie;
655 *syn_loss = tfom->syn_loss;
656 *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
657 } while (read_seqretry(&fastopen_seqlock, seq));
658 }
659 rcu_read_unlock();
660}
661
662void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
663 struct tcp_fastopen_cookie *cookie, bool syn_lost)
664{
665 struct tcp_metrics_block *tm;
666
667 rcu_read_lock();
668 tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
669 if (tm) {
670 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
671
672 write_seqlock_bh(&fastopen_seqlock);
673 tfom->mss = mss;
674 if (cookie->len > 0)
675 tfom->cookie = *cookie;
676 if (syn_lost) {
677 ++tfom->syn_loss;
678 tfom->last_syn_loss = jiffies;
679 } else
680 tfom->syn_loss = 0;
681 write_sequnlock_bh(&fastopen_seqlock);
682 }
683 rcu_read_unlock();
684}
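
Ignoring the seqlock (which only keeps readers consistent), the cache update in tcp_fastopen_cache_set() reduces to a small amount of bookkeeping: record the negotiated MSS, keep the old cookie unless a new one arrives, and count consecutive losses of the data-carrying SYN. A hypothetical userspace rendering of that policy; the struct and function names are invented for the sketch:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct fo_cookie { uint8_t len; uint8_t val[16]; };

struct fo_metrics {
	uint16_t mss;
	uint16_t syn_loss;
	time_t   last_syn_loss;
	struct fo_cookie cookie;
};

/* Mirrors the policy above: always record the MSS, keep the previous
 * cookie unless the caller hands in a new one, and count consecutive
 * SYN-data losses so later connects can back off. */
static void fo_cache_set(struct fo_metrics *m, uint16_t mss,
			 const struct fo_cookie *cookie, int syn_lost)
{
	m->mss = mss;
	if (cookie && cookie->len > 0)
		m->cookie = *cookie;
	if (syn_lost) {
		m->syn_loss++;
		m->last_syn_loss = time(NULL);
	} else {
		m->syn_loss = 0;
	}
}

int main(void)
{
	struct fo_metrics m = { 0 };
	struct fo_cookie c = { .len = 8, .val = { 1, 2, 3, 4, 5, 6, 7, 8 } };

	fo_cache_set(&m, 1460, &c, 0);		/* successful exchange, new cookie */
	fo_cache_set(&m, 1460, NULL, 1);	/* next SYN with data was lost */
	printf("mss=%u cookie_len=%u syn_loss=%u\n",
	       (unsigned int)m.mss, (unsigned int)m.cookie.len,
	       (unsigned int)m.syn_loss);
	return 0;
}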
685
686static struct genl_family tcp_metrics_nl_family = {
687 .id = GENL_ID_GENERATE,
688 .hdrsize = 0,
689 .name = TCP_METRICS_GENL_NAME,
690 .version = TCP_METRICS_GENL_VERSION,
691 .maxattr = TCP_METRICS_ATTR_MAX,
692 .netnsok = true,
693};
694
695static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
696 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
697 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
698 .len = sizeof(struct in6_addr), },
699 /* The following attributes are not received for GET/DEL;
700 * we keep them for reference.
701 */
702#if 0
703 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
704 [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
705 [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
706 [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
707 [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
708 [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
709 [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
710 [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
711 .len = TCP_FASTOPEN_COOKIE_MAX, },
712#endif
713};
714
715/* Add attributes; the caller cancels its header on failure. */
716static int tcp_metrics_fill_info(struct sk_buff *msg,
717 struct tcp_metrics_block *tm)
718{
719 struct nlattr *nest;
720 int i;
721
722 switch (tm->tcpm_addr.family) {
723 case AF_INET:
724 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
725 tm->tcpm_addr.addr.a4) < 0)
726 goto nla_put_failure;
727 break;
728 case AF_INET6:
729 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
730 tm->tcpm_addr.addr.a6) < 0)
731 goto nla_put_failure;
732 break;
733 default:
734 return -EAFNOSUPPORT;
735 }
736
737 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
738 jiffies - tm->tcpm_stamp) < 0)
739 goto nla_put_failure;
740 if (tm->tcpm_ts_stamp) {
741 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
742 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
743 goto nla_put_failure;
744 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
745 tm->tcpm_ts) < 0)
746 goto nla_put_failure;
747 }
748
749 {
750 int n = 0;
751
752 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
753 if (!nest)
754 goto nla_put_failure;
755 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
756 if (!tm->tcpm_vals[i])
757 continue;
758 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
759 goto nla_put_failure;
760 n++;
761 }
762 if (n)
763 nla_nest_end(msg, nest);
764 else
765 nla_nest_cancel(msg, nest);
766 }
767
768 {
769 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
770 unsigned int seq;
771
772 do {
773 seq = read_seqbegin(&fastopen_seqlock);
774 tfom_copy[0] = tm->tcpm_fastopen;
775 } while (read_seqretry(&fastopen_seqlock, seq));
776
777 tfom = tfom_copy;
778 if (tfom->mss &&
779 nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
780 tfom->mss) < 0)
781 goto nla_put_failure;
782 if (tfom->syn_loss &&
783 (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
784 tfom->syn_loss) < 0 ||
785 nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
786 jiffies - tfom->last_syn_loss) < 0))
787 goto nla_put_failure;
788 if (tfom->cookie.len > 0 &&
789 nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
790 tfom->cookie.len, tfom->cookie.val) < 0)
791 goto nla_put_failure;
792 }
793
794 return 0;
795
796nla_put_failure:
797 return -EMSGSIZE;
798}
799
800static int tcp_metrics_dump_info(struct sk_buff *skb,
801 struct netlink_callback *cb,
802 struct tcp_metrics_block *tm)
803{
804 void *hdr;
805
806 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
807 &tcp_metrics_nl_family, NLM_F_MULTI,
808 TCP_METRICS_CMD_GET);
809 if (!hdr)
810 return -EMSGSIZE;
811
812 if (tcp_metrics_fill_info(skb, tm) < 0)
813 goto nla_put_failure;
814
815 return genlmsg_end(skb, hdr);
816
817nla_put_failure:
818 genlmsg_cancel(skb, hdr);
819 return -EMSGSIZE;
820}
821
822static int tcp_metrics_nl_dump(struct sk_buff *skb,
823 struct netlink_callback *cb)
824{
825 struct net *net = sock_net(skb->sk);
826 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
827 unsigned int row, s_row = cb->args[0];
828 int s_col = cb->args[1], col = s_col;
829
830 for (row = s_row; row < max_rows; row++, s_col = 0) {
831 struct tcp_metrics_block *tm;
832 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
833
834 rcu_read_lock();
835 for (col = 0, tm = rcu_dereference(hb->chain); tm;
836 tm = rcu_dereference(tm->tcpm_next), col++) {
837 if (col < s_col)
838 continue;
839 if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
840 rcu_read_unlock();
841 goto done;
842 }
843 }
844 rcu_read_unlock();
845 }
846
847done:
848 cb->args[0] = row;
849 cb->args[1] = col;
850 return skb->len;
851}
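
The dump resumes across netlink messages by parking a (row, col) cursor in cb->args[]. Below is a toy version of the same resumable walk, where a fixed per-call budget stands in for the skb running out of space; table size, names and output are illustrative:

#include <stdio.h>

#define ROWS 4
#define COLS 3

struct cursor { unsigned int row, col; };

/* Emit at most 'budget' entries starting at *cur; return how many were
 * emitted. On a budget stop the cursor names the entry that did not fit,
 * so the next call re-attempts exactly that entry, as the dump above does. */
static int dump_some(struct cursor *cur, int budget)
{
	unsigned int row, col;
	int emitted = 0;

	for (row = cur->row; row < ROWS; row++, cur->col = 0) {
		for (col = cur->col; col < COLS; col++) {
			if (emitted == budget) {
				cur->row = row;
				cur->col = col;
				return emitted;
			}
			printf("entry %u.%u\n", row, col);
			emitted++;
		}
	}
	cur->row = ROWS;
	cur->col = 0;
	return emitted;
}

int main(void)
{
	struct cursor cur = { 0, 0 };

	while (dump_some(&cur, 5) > 0)
		printf("-- message boundary --\n");
	return 0;
}

Each "message boundary" stands in for one filled skb handed back to userspace; the saved cursor is all the state needed to continue.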
852
853static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
854 unsigned int *hash, int optional)
855{
856 struct nlattr *a;
857
858 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
859 if (a) {
860 addr->family = AF_INET;
861 addr->addr.a4 = nla_get_be32(a);
862 *hash = (__force unsigned int) addr->addr.a4;
863 return 0;
864 }
865 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
866 if (a) {
867 if (nla_len(a) != sizeof(struct in6_addr))
868 return -EINVAL;
869 addr->family = AF_INET6;
870 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
871 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
872 return 0;
873 }
874 return optional ? 1 : -EAFNOSUPPORT;
875}
876
877static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
878{
879 struct tcp_metrics_block *tm;
880 struct inetpeer_addr addr;
881 unsigned int hash;
882 struct sk_buff *msg;
883 struct net *net = genl_info_net(info);
884 void *reply;
885 int ret;
886
887 ret = parse_nl_addr(info, &addr, &hash, 0);
888 if (ret < 0)
889 return ret;
890
891 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
892 if (!msg)
893 return -ENOMEM;
894
895 reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
896 info->genlhdr->cmd);
897 if (!reply)
898 goto nla_put_failure;
899
900 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
901 ret = -ESRCH;
902 rcu_read_lock();
903 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
904 tm = rcu_dereference(tm->tcpm_next)) {
905 if (addr_same(&tm->tcpm_addr, &addr)) {
906 ret = tcp_metrics_fill_info(msg, tm);
907 break;
908 }
909 }
910 rcu_read_unlock();
911 if (ret < 0)
912 goto out_free;
913
914 genlmsg_end(msg, reply);
915 return genlmsg_reply(msg, info);
916
917nla_put_failure:
918 ret = -EMSGSIZE;
919
920out_free:
921 nlmsg_free(msg);
922 return ret;
923}
924
925#define deref_locked_genl(p) \
926 rcu_dereference_protected(p, lockdep_genl_is_held() && \
927 lockdep_is_held(&tcp_metrics_lock))
928
929#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
930
931static int tcp_metrics_flush_all(struct net *net)
932{
933 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
934 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
935 struct tcp_metrics_block *tm;
936 unsigned int row;
937
938 for (row = 0; row < max_rows; row++, hb++) {
939 spin_lock_bh(&tcp_metrics_lock);
940 tm = deref_locked_genl(hb->chain);
941 if (tm)
942 hb->chain = NULL;
943 spin_unlock_bh(&tcp_metrics_lock);
944 while (tm) {
945 struct tcp_metrics_block *next;
946
947 next = deref_genl(tm->tcpm_next);
948 kfree_rcu(tm, rcu_head);
949 tm = next;
950 }
951 }
952 return 0;
953}
954
955static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
956{
957 struct tcpm_hash_bucket *hb;
958 struct tcp_metrics_block *tm;
959 struct tcp_metrics_block __rcu **pp;
960 struct inetpeer_addr addr;
961 unsigned int hash;
962 struct net *net = genl_info_net(info);
963 int ret;
964
965 ret = parse_nl_addr(info, &addr, &hash, 1);
966 if (ret < 0)
967 return ret;
968 if (ret > 0)
969 return tcp_metrics_flush_all(net);
970
971 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
972 hb = net->ipv4.tcp_metrics_hash + hash;
973 pp = &hb->chain;
974 spin_lock_bh(&tcp_metrics_lock);
975 for (tm = deref_locked_genl(*pp); tm;
976 pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
977 if (addr_same(&tm->tcpm_addr, &addr)) {
978 *pp = tm->tcpm_next;
979 break;
980 }
981 }
982 spin_unlock_bh(&tcp_metrics_lock);
983 if (!tm)
984 return -ESRCH;
985 kfree_rcu(tm, rcu_head);
986 return 0;
987}
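
The delete path walks the hash chain with a pointer-to-pointer, so unlinking needs no special case for the chain head. The same idiom in a self-contained form, with invented types and values:

#include <stdio.h>

struct node { int key; struct node *next; };

/* *pp always names the link that would have to be rewritten, whether it
 * is the list head or some node's next pointer. */
static struct node *unlink_key(struct node **head, int key)
{
	struct node **pp = head, *n;

	for (n = *pp; n; pp = &n->next, n = *pp) {
		if (n->key == key) {
			*pp = n->next;	/* splice the node out */
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *hit = unlink_key(&head, 2);

	printf("removed=%d head=%d->%d\n",
	       hit ? hit->key : -1, head->key, head->next->key);
	return 0;
}

Compared with carrying a separate "prev" pointer, the extra level of indirection keeps the loop body down to a single splice.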
988
989static struct genl_ops tcp_metrics_nl_ops[] = {
990 {
991 .cmd = TCP_METRICS_CMD_GET,
992 .doit = tcp_metrics_nl_cmd_get,
993 .dumpit = tcp_metrics_nl_dump,
994 .policy = tcp_metrics_nl_policy,
995 .flags = GENL_ADMIN_PERM,
996 },
997 {
998 .cmd = TCP_METRICS_CMD_DEL,
999 .doit = tcp_metrics_nl_cmd_del,
1000 .policy = tcp_metrics_nl_policy,
1001 .flags = GENL_ADMIN_PERM,
1002 },
1003};
1004
1005static unsigned int tcpmhash_entries;
1006static int __init set_tcpmhash_entries(char *str)
1007{
1008 ssize_t ret;
1009
1010 if (!str)
1011 return 0;
1012
1013 ret = kstrtouint(str, 0, &tcpmhash_entries);
1014 if (ret)
1015 return 0;
1016
1017 return 1;
1018}
1019__setup("tcpmhash_entries=", set_tcpmhash_entries);
1020
1021static int __net_init tcp_net_metrics_init(struct net *net)
1022{
1023 size_t size;
1024 unsigned int slots;
1025
1026 slots = tcpmhash_entries;
1027 if (!slots) {
1028 if (totalram_pages >= 128 * 1024)
1029 slots = 16 * 1024;
1030 else
1031 slots = 8 * 1024;
1032 }
1033
1034 net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
1035 size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
1036
1037 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1038 if (!net->ipv4.tcp_metrics_hash)
1039 net->ipv4.tcp_metrics_hash = vzalloc(size);
1040
1041 if (!net->ipv4.tcp_metrics_hash)
1042 return -ENOMEM;
1043
1044 return 0;
1045}
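
A rough model of the sizing arithmetic above, assuming each bucket holds a single chain pointer; order_base_2() is re-implemented here as a plain loop rather than the kernel helper, and the numbers are only examples:

#include <stdio.h>

static unsigned int order_base_2(unsigned int n)	/* smallest l with 2^l >= n */
{
	unsigned int l = 0;

	while ((1u << l) < n)
		l++;
	return l;
}

int main(void)
{
	unsigned int slots = 16 * 1024;			/* the large-memory default above */
	unsigned int hash_log = order_base_2(slots);
	size_t size = sizeof(void *) << hash_log;	/* one chain pointer per bucket */

	printf("slots=%u hash_log=%u table=%zu bytes\n", slots, hash_log, size);
	return 0;
}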
1046
1047static void __net_exit tcp_net_metrics_exit(struct net *net)
1048{
1049 unsigned int i;
1050
1051 for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
1052 struct tcp_metrics_block *tm, *next;
1053
1054 tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
1055 while (tm) {
1056 next = rcu_dereference_protected(tm->tcpm_next, 1);
1057 kfree(tm);
1058 tm = next;
1059 }
1060 }
1061 if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
1062 vfree(net->ipv4.tcp_metrics_hash);
1063 else
1064 kfree(net->ipv4.tcp_metrics_hash);
1065}
1066
1067static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
1068 .init = tcp_net_metrics_init,
1069 .exit = tcp_net_metrics_exit,
1070};
1071
1072void __init tcp_metrics_init(void)
1073{
1074 int ret;
1075
1076 ret = register_pernet_subsys(&tcp_net_metrics_ops);
1077 if (ret < 0)
1078 goto cleanup;
1079 ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1080 tcp_metrics_nl_ops,
1081 ARRAY_SIZE(tcp_metrics_nl_ops));
1082 if (ret < 0)
1083 goto cleanup_subsys;
1084 return;
1085
1086cleanup_subsys:
1087 unregister_pernet_subsys(&tcp_net_metrics_ops);
1088
1089cleanup:
1090 return;
1091}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f35f2dfb640..0ce3d06dce6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,12 +49,62 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
55 return true; 105 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
57 return true; 107 return 1;
58 return seq == e_win && seq == end_seq; 108 return seq == e_win && seq == end_seq;
59} 109}
60 110
@@ -85,21 +135,19 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
85 * spinlock it. I do not want! Well, probability of misbehaviour 135 * spinlock it. I do not want! Well, probability of misbehaviour
86 * is ridiculously low and, seems, we could use some mb() tricks 136 * is ridiculously low and, seems, we could use some mb() tricks
87 * to avoid misread sequence numbers, states etc. --ANK 137 * to avoid misread sequence numbers, states etc. --ANK
88 *
89 * We don't need to initialize tmp_out.sack_ok as we don't use the results
90 */ 138 */
91enum tcp_tw_status 139enum tcp_tw_status
92tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 140tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
93 const struct tcphdr *th) 141 const struct tcphdr *th)
94{ 142{
95 struct tcp_options_received tmp_opt; 143 struct tcp_options_received tmp_opt;
96 const u8 *hash_location; 144 u8 *hash_location;
97 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 145 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
98 bool paws_reject = false; 146 int paws_reject = 0;
99 147
100 tmp_opt.saw_tstamp = 0; 148 tmp_opt.saw_tstamp = 0;
101 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
102 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 150 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
103 151
104 if (tmp_opt.saw_tstamp) { 152 if (tmp_opt.saw_tstamp) {
105 tmp_opt.ts_recent = tcptw->tw_ts_recent; 153 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -268,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
268 struct inet_timewait_sock *tw = NULL; 316 struct inet_timewait_sock *tw = NULL;
269 const struct inet_connection_sock *icsk = inet_csk(sk); 317 const struct inet_connection_sock *icsk = inet_csk(sk);
270 const struct tcp_sock *tp = tcp_sk(sk); 318 const struct tcp_sock *tp = tcp_sk(sk);
271 bool recycle_ok = false; 319 int recycle_ok = 0;
272 320
273 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
274 recycle_ok = tcp_remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
@@ -279,9 +327,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
279 if (tw != NULL) { 327 if (tw != NULL) {
280 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 328 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
281 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 329 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
282 struct inet_sock *inet = inet_sk(sk);
283 330
284 tw->tw_transparent = inet->transparent; 331 tw->tw_transparent = inet_sk(sk)->transparent;
285 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
286 tcptw->tw_rcv_nxt = tp->rcv_nxt; 333 tcptw->tw_rcv_nxt = tp->rcv_nxt;
287 tcptw->tw_snd_nxt = tp->snd_nxt; 334 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -289,16 +336,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
289 tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 336 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
290 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 337 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
291 338
292#if IS_ENABLED(CONFIG_IPV6) 339#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
293 if (tw->tw_family == PF_INET6) { 340 if (tw->tw_family == PF_INET6) {
294 struct ipv6_pinfo *np = inet6_sk(sk); 341 struct ipv6_pinfo *np = inet6_sk(sk);
295 struct inet6_timewait_sock *tw6; 342 struct inet6_timewait_sock *tw6;
296 343
297 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); 344 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
298 tw6 = inet6_twsk((struct sock *)tw); 345 tw6 = inet6_twsk((struct sock *)tw);
299 tw6->tw_v6_daddr = np->daddr; 346 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
300 tw6->tw_v6_rcv_saddr = np->rcv_saddr; 347 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
301 tw->tw_tclass = np->tclass;
302 tw->tw_ipv6only = np->ipv6only; 348 tw->tw_ipv6only = np->ipv6only;
303 } 349 }
304#endif 350#endif
@@ -312,11 +358,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
312 */ 358 */
313 do { 359 do {
314 struct tcp_md5sig_key *key; 360 struct tcp_md5sig_key *key;
315 tcptw->tw_md5_key = NULL; 361 memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
362 tcptw->tw_md5_keylen = 0;
316 key = tp->af_specific->md5_lookup(sk, sk); 363 key = tp->af_specific->md5_lookup(sk, sk);
317 if (key != NULL) { 364 if (key != NULL) {
318 tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); 365 memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
319 if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL) 366 tcptw->tw_md5_keylen = key->keylen;
367 if (tcp_alloc_md5sig_pool(sk) == NULL)
320 BUG(); 368 BUG();
321 } 369 }
322 } while (0); 370 } while (0);
@@ -356,11 +404,8 @@ void tcp_twsk_destructor(struct sock *sk)
356{ 404{
357#ifdef CONFIG_TCP_MD5SIG 405#ifdef CONFIG_TCP_MD5SIG
358 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 406 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
359 407 if (twsk->tw_md5_keylen)
360 if (twsk->tw_md5_key) {
361 tcp_free_md5sig_pool(); 408 tcp_free_md5sig_pool();
362 kfree_rcu(twsk->tw_md5_key, rcu);
363 }
364#endif 409#endif
365} 410}
366EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 411EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
@@ -379,7 +424,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
379 */ 424 */
380struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 425struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
381{ 426{
382 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); 427 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
383 428
384 if (newsk != NULL) { 429 if (newsk != NULL) {
385 const struct inet_request_sock *ireq = inet_rsk(req); 430 const struct inet_request_sock *ireq = inet_rsk(req);
@@ -424,7 +469,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
424 treq->snt_isn + 1 + tcp_s_data_size(oldtp); 469 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
425 470
426 tcp_prequeue_init(newtp); 471 tcp_prequeue_init(newtp);
427 INIT_LIST_HEAD(&newtp->tsq_node);
428 472
429 tcp_init_wl(newtp, treq->rcv_isn); 473 tcp_init_wl(newtp, treq->rcv_isn);
430 474
@@ -437,7 +481,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
437 newtp->sacked_out = 0; 481 newtp->sacked_out = 0;
438 newtp->fackets_out = 0; 482 newtp->fackets_out = 0;
439 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 483 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
440 tcp_enable_early_retrans(newtp);
441 484
442 /* So many TCP implementations out there (incorrectly) count the 485 /* So many TCP implementations out there (incorrectly) count the
443 * initial SYN frame in their delayed-ACK and congestion control 486 * initial SYN frame in their delayed-ACK and congestion control
@@ -451,9 +494,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->frto_counter = 0; 494 newtp->frto_counter = 0;
452 newtp->frto_highmark = 0; 495 newtp->frto_highmark = 0;
453 496
454 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 497 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
455 !try_module_get(newicsk->icsk_ca_ops->owner))
456 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
457 498
458 tcp_set_ca_state(newsk, TCP_CA_Open); 499 tcp_set_ca_state(newsk, TCP_CA_Open);
459 tcp_init_xmit_timers(newsk); 500 tcp_init_xmit_timers(newsk);
@@ -509,8 +550,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 550 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
510 newtp->rx_opt.mss_clamp = req->mss; 551 newtp->rx_opt.mss_clamp = req->mss;
511 TCP_ECN_openreq_child(newtp, req); 552 TCP_ECN_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL;
513 newtp->syn_data_acked = 0;
514 553
515 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 554 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
516 } 555 }
@@ -519,33 +558,24 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
519EXPORT_SYMBOL(tcp_create_openreq_child); 558EXPORT_SYMBOL(tcp_create_openreq_child);
520 559
521/* 560/*
522 * Process an incoming packet for SYN_RECV sockets represented as a 561 * Process an incoming packet for SYN_RECV sockets represented
523 * request_sock. Normally sk is the listener socket but for TFO it 562 * as a request_sock.
524 * points to the child socket.
525 *
526 * XXX (TFO) - The current impl contains a special check for ack
527 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
528 *
529 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
530 */ 563 */
531 564
532struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 565struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
533 struct request_sock *req, 566 struct request_sock *req,
534 struct request_sock **prev, 567 struct request_sock **prev)
535 bool fastopen)
536{ 568{
537 struct tcp_options_received tmp_opt; 569 struct tcp_options_received tmp_opt;
538 const u8 *hash_location; 570 u8 *hash_location;
539 struct sock *child; 571 struct sock *child;
540 const struct tcphdr *th = tcp_hdr(skb); 572 const struct tcphdr *th = tcp_hdr(skb);
541 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 573 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
542 bool paws_reject = false; 574 int paws_reject = 0;
543
544 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
545 575
546 tmp_opt.saw_tstamp = 0; 576 tmp_opt.saw_tstamp = 0;
547 if (th->doff > (sizeof(struct tcphdr)>>2)) { 577 if (th->doff > (sizeof(struct tcphdr)>>2)) {
548 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 578 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
549 579
550 if (tmp_opt.saw_tstamp) { 580 if (tmp_opt.saw_tstamp) {
551 tmp_opt.ts_recent = req->ts_recent; 581 tmp_opt.ts_recent = req->ts_recent;
@@ -553,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
553 * it can be estimated (approximately) 583 * it can be estimated (approximately)
554 * from another data. 584 * from another data.
555 */ 585 */
556 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); 586 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
557 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 587 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
558 } 588 }
559 } 589 }
@@ -578,11 +608,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
578 * 608 *
579 * Enforce "SYN-ACK" according to figure 8, figure 6 609 * Enforce "SYN-ACK" according to figure 8, figure 6
580 * of RFC793, fixed by RFC1122. 610 * of RFC793, fixed by RFC1122.
581 *
582 * Note that even if there is new data in the SYN packet
583 * they will be thrown away too.
584 */ 611 */
585 inet_rtx_syn_ack(sk, req); 612 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
586 return NULL; 613 return NULL;
587 } 614 }
588 615
@@ -638,12 +665,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
638 * sent (the segment carries an unacceptable ACK) ... 665 * sent (the segment carries an unacceptable ACK) ...
639 * a reset is sent." 666 * a reset is sent."
640 * 667 *
641 * Invalid ACK: reset will be sent by listening socket. 668 * Invalid ACK: reset will be sent by listening socket
642 * Note that the ACK validity check for a Fast Open socket is done
643 * elsewhere and is checked directly against the child socket rather
644 * than req because user data may have been sent out.
645 */ 669 */
646 if ((flg & TCP_FLAG_ACK) && !fastopen && 670 if ((flg & TCP_FLAG_ACK) &&
647 (TCP_SKB_CB(skb)->ack_seq != 671 (TCP_SKB_CB(skb)->ack_seq !=
648 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 672 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
649 return sk; 673 return sk;
@@ -656,7 +680,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
656 /* RFC793: "first check sequence number". */ 680 /* RFC793: "first check sequence number". */
657 681
658 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 682 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
659 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { 683 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
660 /* Out of window: send ACK and drop. */ 684 /* Out of window: send ACK and drop. */
661 if (!(flg & TCP_FLAG_RST)) 685 if (!(flg & TCP_FLAG_RST))
662 req->rsk_ops->send_ack(sk, skb, req); 686 req->rsk_ops->send_ack(sk, skb, req);
@@ -667,7 +691,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 691
668 /* In sequence, PAWS is OK. */ 692 /* In sequence, PAWS is OK. */
669 693
670 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) 694 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
671 req->ts_recent = tmp_opt.rcv_tsval; 695 req->ts_recent = tmp_opt.rcv_tsval;
672 696
673 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 697 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -686,32 +710,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
686 710
687 /* ACK sequence verified above, just make sure ACK is 711 /* ACK sequence verified above, just make sure ACK is
688 * set. If ACK not set, just silently drop the packet. 712 * set. If ACK not set, just silently drop the packet.
689 *
690 * XXX (TFO) - if we ever allow "data after SYN", the
691 * following check needs to be removed.
692 */ 713 */
693 if (!(flg & TCP_FLAG_ACK)) 714 if (!(flg & TCP_FLAG_ACK))
694 return NULL; 715 return NULL;
695 716
696 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
697 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
698 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
699 else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
700 tcp_rsk(req)->snt_synack = 0;
701
702 /* For Fast Open no more processing is needed (sk is the
703 * child socket).
704 */
705 if (fastopen)
706 return sk;
707
708 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 717 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
709 if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 718 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
710 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 719 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
711 inet_rsk(req)->acked = 1; 720 inet_rsk(req)->acked = 1;
712 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 721 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
713 return NULL; 722 return NULL;
714 } 723 }
724 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
725 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
726 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
727 tcp_rsk(req)->snt_synack = 0;
715 728
716 /* OK, ACK is valid, create big socket and 729 /* OK, ACK is valid, create big socket and
717 * feed this segment to it. It will repeat all 730 * feed this segment to it. It will repeat all
@@ -736,21 +749,11 @@ listen_overflow:
736 } 749 }
737 750
738embryonic_reset: 751embryonic_reset:
739 if (!(flg & TCP_FLAG_RST)) { 752 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
740 /* Received a bad SYN pkt - for TFO We try not to reset 753 if (!(flg & TCP_FLAG_RST))
741 * the local connection unless it's really necessary to
742 * avoid becoming vulnerable to outside attack aiming at
743 * resetting legit local connections.
744 */
745 req->rsk_ops->send_reset(sk, skb); 754 req->rsk_ops->send_reset(sk, skb);
746 } else if (fastopen) { /* received a valid RST pkt */ 755
747 reqsk_fastopen_remove(sk, req, true); 756 inet_csk_reqsk_queue_drop(sk, req, prev);
748 tcp_reset(sk);
749 }
750 if (!fastopen) {
751 inet_csk_reqsk_queue_drop(sk, req, prev);
752 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
753 }
754 return NULL; 757 return NULL;
755} 758}
756EXPORT_SYMBOL(tcp_check_req); 759EXPORT_SYMBOL(tcp_check_req);
@@ -759,12 +762,6 @@ EXPORT_SYMBOL(tcp_check_req);
759 * Queue segment on the new socket if the new socket is active, 762 * Queue segment on the new socket if the new socket is active,
760 * otherwise we just shortcircuit this and continue with 763 * otherwise we just shortcircuit this and continue with
761 * the new socket. 764 * the new socket.
762 *
763 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
764 * when entering. But other states are possible due to a race condition
765 * where after __inet_lookup_established() fails but before the listener
766 * locked is obtained, other packets cause the same connection to
767 * be created.
768 */ 765 */
769 766
770int tcp_child_process(struct sock *parent, struct sock *child, 767int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5d451593ef1..faf257b9415 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,8 +34,6 @@
34 * 34 *
35 */ 35 */
36 36
37#define pr_fmt(fmt) "TCP: " fmt
38
39#include <net/tcp.h> 37#include <net/tcp.h>
40 38
41#include <linux/compiler.h> 39#include <linux/compiler.h>
@@ -50,9 +48,6 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
50 */ 48 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 49int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52 50
53/* Default TSQ limit of two TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55
56/* This limits the percentage of the congestion window which we 51/* This limits the percentage of the congestion window which we
57 * will allow a single TSO frame to consume. Building TSO frames 52 * will allow a single TSO frame to consume. Building TSO frames
58 * which are too large can cause TCP streams to be bursty. 53 * which are too large can cause TCP streams to be bursty.
@@ -68,11 +63,9 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 63int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 64EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
70 65
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp);
73 66
74/* Account for new data that has been sent to the network. */ 67/* Account for new data that has been sent to the network. */
75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 68static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
76{ 69{
77 struct tcp_sock *tp = tcp_sk(sk); 70 struct tcp_sock *tp = tcp_sk(sk);
78 unsigned int prior_packets = tp->packets_out; 71 unsigned int prior_packets = tp->packets_out;
@@ -85,8 +78,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
85 tp->frto_counter = 3; 78 tp->frto_counter = 3;
86 79
87 tp->packets_out += tcp_skb_pcount(skb); 80 tp->packets_out += tcp_skb_pcount(skb);
88 if (!prior_packets || tp->early_retrans_delayed) 81 if (!prior_packets)
89 tcp_rearm_rto(sk); 82 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
83 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
90} 84}
91 85
92/* SND.NXT, if window was not shrunk. 86/* SND.NXT, if window was not shrunk.
@@ -95,9 +89,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
95 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 89 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
96 * invalid. OK, let's make this for now: 90 * invalid. OK, let's make this for now:
97 */ 91 */
98static inline __u32 tcp_acceptable_seq(const struct sock *sk) 92static inline __u32 tcp_acceptable_seq(struct sock *sk)
99{ 93{
100 const struct tcp_sock *tp = tcp_sk(sk); 94 struct tcp_sock *tp = tcp_sk(sk);
101 95
102 if (!before(tcp_wnd_end(tp), tp->snd_nxt)) 96 if (!before(tcp_wnd_end(tp), tp->snd_nxt))
103 return tp->snd_nxt; 97 return tp->snd_nxt;
@@ -122,7 +116,7 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
122static __u16 tcp_advertise_mss(struct sock *sk) 116static __u16 tcp_advertise_mss(struct sock *sk)
123{ 117{
124 struct tcp_sock *tp = tcp_sk(sk); 118 struct tcp_sock *tp = tcp_sk(sk);
125 const struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
126 int mss = tp->advmss; 120 int mss = tp->advmss;
127 121
128 if (dst) { 122 if (dst) {
@@ -139,7 +133,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)
139 133
140/* RFC2861. Reset CWND after idle period longer RTO to "restart window". 134/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
141 * This is the first part of cwnd validation mechanism. */ 135 * This is the first part of cwnd validation mechanism. */
142static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) 136static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
143{ 137{
144 struct tcp_sock *tp = tcp_sk(sk); 138 struct tcp_sock *tp = tcp_sk(sk);
145 s32 delta = tcp_time_stamp - tp->lsndtime; 139 s32 delta = tcp_time_stamp - tp->lsndtime;
@@ -160,7 +154,7 @@ static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
160 154
161/* Congestion state accounting after a packet has been sent. */ 155/* Congestion state accounting after a packet has been sent. */
162static void tcp_event_data_sent(struct tcp_sock *tp, 156static void tcp_event_data_sent(struct tcp_sock *tp,
163 struct sock *sk) 157 struct sk_buff *skb, struct sock *sk)
164{ 158{
165 struct inet_connection_sock *icsk = inet_csk(sk); 159 struct inet_connection_sock *icsk = inet_csk(sk);
166 const u32 now = tcp_time_stamp; 160 const u32 now = tcp_time_stamp;
@@ -301,11 +295,11 @@ static u16 tcp_select_window(struct sock *sk)
301} 295}
302 296
303/* Packet ECN state for a SYN-ACK */ 297/* Packet ECN state for a SYN-ACK */
304static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 298static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
305{ 299{
306 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 300 TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
307 if (!(tp->ecn_flags & TCP_ECN_OK)) 301 if (!(tp->ecn_flags & TCP_ECN_OK))
308 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 302 TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
309} 303}
310 304
311/* Packet ECN state for a SYN. */ 305/* Packet ECN state for a SYN. */
@@ -315,13 +309,13 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
315 309
316 tp->ecn_flags = 0; 310 tp->ecn_flags = 0;
317 if (sysctl_tcp_ecn == 1) { 311 if (sysctl_tcp_ecn == 1) {
318 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 312 TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
319 tp->ecn_flags = TCP_ECN_OK; 313 tp->ecn_flags = TCP_ECN_OK;
320 } 314 }
321} 315}
322 316
323static __inline__ void 317static __inline__ void
324TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 318TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
325{ 319{
326 if (inet_rsk(req)->ecn_ok) 320 if (inet_rsk(req)->ecn_ok)
327 th->ece = 1; 321 th->ece = 1;
@@ -362,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
362 skb->ip_summed = CHECKSUM_PARTIAL; 356 skb->ip_summed = CHECKSUM_PARTIAL;
363 skb->csum = 0; 357 skb->csum = 0;
364 358
365 TCP_SKB_CB(skb)->tcp_flags = flags; 359 TCP_SKB_CB(skb)->flags = flags;
366 TCP_SKB_CB(skb)->sacked = 0; 360 TCP_SKB_CB(skb)->sacked = 0;
367 361
368 skb_shinfo(skb)->gso_segs = 1; 362 skb_shinfo(skb)->gso_segs = 1;
@@ -375,7 +369,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
375 TCP_SKB_CB(skb)->end_seq = seq; 369 TCP_SKB_CB(skb)->end_seq = seq;
376} 370}
377 371
378static inline bool tcp_urg_mode(const struct tcp_sock *tp) 372static inline int tcp_urg_mode(const struct tcp_sock *tp)
379{ 373{
380 return tp->snd_una != tp->snd_up; 374 return tp->snd_una != tp->snd_up;
381} 375}
@@ -385,17 +379,15 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
385#define OPTION_MD5 (1 << 2) 379#define OPTION_MD5 (1 << 2)
386#define OPTION_WSCALE (1 << 3) 380#define OPTION_WSCALE (1 << 3)
387#define OPTION_COOKIE_EXTENSION (1 << 4) 381#define OPTION_COOKIE_EXTENSION (1 << 4)
388#define OPTION_FAST_OPEN_COOKIE (1 << 8)
389 382
390struct tcp_out_options { 383struct tcp_out_options {
391 u16 options; /* bit field of OPTION_* */ 384 u8 options; /* bit field of OPTION_* */
392 u16 mss; /* 0 to disable */
393 u8 ws; /* window scale, 0 to disable */ 385 u8 ws; /* window scale, 0 to disable */
394 u8 num_sack_blocks; /* number of SACK blocks to include */ 386 u8 num_sack_blocks; /* number of SACK blocks to include */
395 u8 hash_size; /* bytes in hash_location */ 387 u8 hash_size; /* bytes in hash_location */
396 __u8 *hash_location; /* temporary pointer, overloaded */ 388 u16 mss; /* 0 to disable */
397 __u32 tsval, tsecr; /* need to include OPTION_TS */ 389 __u32 tsval, tsecr; /* need to include OPTION_TS */
398 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ 390 __u8 *hash_location; /* temporary pointer, overloaded */
399}; 391};
400 392
401/* The sysctl int routines are generic, so check consistency here. 393/* The sysctl int routines are generic, so check consistency here.
@@ -444,7 +436,7 @@ static u8 tcp_cookie_size_check(u8 desired)
444static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 436static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
445 struct tcp_out_options *opts) 437 struct tcp_out_options *opts)
446{ 438{
447 u16 options = opts->options; /* mungable copy */ 439 u8 options = opts->options; /* mungable copy */
448 440
449 /* Having both authentication and cookies for security is redundant, 441 /* Having both authentication and cookies for security is redundant,
450 * and there's certainly not enough room. Instead, the cookie-less 442 * and there's certainly not enough room. Instead, the cookie-less
@@ -566,37 +558,20 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
566 558
567 tp->rx_opt.dsack = 0; 559 tp->rx_opt.dsack = 0;
568 } 560 }
569
570 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
571 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
572
573 *ptr++ = htonl((TCPOPT_EXP << 24) |
574 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
575 TCPOPT_FASTOPEN_MAGIC);
576
577 memcpy(ptr, foc->val, foc->len);
578 if ((foc->len & 3) == 2) {
579 u8 *align = ((u8 *)ptr) + foc->len;
580 align[0] = align[1] = TCPOPT_NOP;
581 }
582 ptr += (foc->len + 3) >> 2;
583 }
584} 561}
585 562
586/* Compute TCP options for SYN packets. This is not the final 563/* Compute TCP options for SYN packets. This is not the final
587 * network wire format yet. 564 * network wire format yet.
588 */ 565 */
589static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, 566static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
590 struct tcp_out_options *opts, 567 struct tcp_out_options *opts,
591 struct tcp_md5sig_key **md5) 568 struct tcp_md5sig_key **md5) {
592{
593 struct tcp_sock *tp = tcp_sk(sk); 569 struct tcp_sock *tp = tcp_sk(sk);
594 struct tcp_cookie_values *cvp = tp->cookie_values; 570 struct tcp_cookie_values *cvp = tp->cookie_values;
595 unsigned int remaining = MAX_TCP_OPTION_SPACE; 571 unsigned remaining = MAX_TCP_OPTION_SPACE;
596 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 572 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
597 tcp_cookie_size_check(cvp->cookie_desired) : 573 tcp_cookie_size_check(cvp->cookie_desired) :
598 0; 574 0;
599 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
600 575
601#ifdef CONFIG_TCP_MD5SIG 576#ifdef CONFIG_TCP_MD5SIG
602 *md5 = tp->af_specific->md5_lookup(sk, sk); 577 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -637,16 +612,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
637 remaining -= TCPOLEN_SACKPERM_ALIGNED; 612 remaining -= TCPOLEN_SACKPERM_ALIGNED;
638 } 613 }
639 614
640 if (fastopen && fastopen->cookie.len >= 0) {
641 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
642 need = (need + 3) & ~3U; /* Align to 32 bits */
643 if (remaining >= need) {
644 opts->options |= OPTION_FAST_OPEN_COOKIE;
645 opts->fastopen_cookie = &fastopen->cookie;
646 remaining -= need;
647 tp->syn_fastopen = 1;
648 }
649 }
650 /* Note that timestamps are required by the specification. 615 /* Note that timestamps are required by the specification.
651 * 616 *
652 * Odd numbers of bytes are prohibited by the specification, ensuring 617 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -697,16 +662,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
697} 662}
698 663
699/* Set up TCP options for SYN-ACKs. */ 664/* Set up TCP options for SYN-ACKs. */
700static unsigned int tcp_synack_options(struct sock *sk, 665static unsigned tcp_synack_options(struct sock *sk,
701 struct request_sock *req, 666 struct request_sock *req,
702 unsigned int mss, struct sk_buff *skb, 667 unsigned mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 668 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 669 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp, 670 struct tcp_extend_values *xvp)
706 struct tcp_fastopen_cookie *foc)
707{ 671{
708 struct inet_request_sock *ireq = inet_rsk(req); 672 struct inet_request_sock *ireq = inet_rsk(req);
709 unsigned int remaining = MAX_TCP_OPTION_SPACE; 673 unsigned remaining = MAX_TCP_OPTION_SPACE;
710 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? 674 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
711 xvp->cookie_plus : 675 xvp->cookie_plus :
712 0; 676 0;
@@ -748,15 +712,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
748 if (unlikely(!ireq->tstamp_ok)) 712 if (unlikely(!ireq->tstamp_ok))
749 remaining -= TCPOLEN_SACKPERM_ALIGNED; 713 remaining -= TCPOLEN_SACKPERM_ALIGNED;
750 } 714 }
751 if (foc != NULL) { 715
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
760 /* Similar rationale to tcp_syn_options() applies here, too. 716 /* Similar rationale to tcp_syn_options() applies here, too.
761 * If the <SYN> options fit, the same options should fit now! 717 * If the <SYN> options fit, the same options should fit now!
762 */ 718 */
@@ -785,13 +741,12 @@ static unsigned int tcp_synack_options(struct sock *sk,
785/* Compute TCP options for ESTABLISHED sockets. This is not the 741/* Compute TCP options for ESTABLISHED sockets. This is not the
786 * final wire format yet. 742 * final wire format yet.
787 */ 743 */
788static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, 744static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
789 struct tcp_out_options *opts, 745 struct tcp_out_options *opts,
790 struct tcp_md5sig_key **md5) 746 struct tcp_md5sig_key **md5) {
791{
792 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 747 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
793 struct tcp_sock *tp = tcp_sk(sk); 748 struct tcp_sock *tp = tcp_sk(sk);
794 unsigned int size = 0; 749 unsigned size = 0;
795 unsigned int eff_sacks; 750 unsigned int eff_sacks;
796 751
797#ifdef CONFIG_TCP_MD5SIG 752#ifdef CONFIG_TCP_MD5SIG
@@ -813,9 +768,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
813 768
814 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; 769 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
815 if (unlikely(eff_sacks)) { 770 if (unlikely(eff_sacks)) {
816 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; 771 const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
817 opts->num_sack_blocks = 772 opts->num_sack_blocks =
818 min_t(unsigned int, eff_sacks, 773 min_t(unsigned, eff_sacks,
819 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 774 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
820 TCPOLEN_SACK_PERBLOCK); 775 TCPOLEN_SACK_PERBLOCK);
821 size += TCPOLEN_SACK_BASE_ALIGNED + 776 size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -825,160 +780,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
825 return size; 780 return size;
826} 781}
827 782
828
829/* TCP SMALL QUEUES (TSQ)
830 *
831 * The TSQ goal is to keep a small number of skbs per tcp flow in the tx queues (qdisc+dev)
832 * to reduce RTT and bufferbloat.
833 * We do this using a special skb destructor (tcp_wfree).
834 *
835 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb
836 * needs to be reallocated in a driver.
837 * The invariant is that skb->truesize gets subtracted from sk->sk_wmem_alloc.
838 *
839 * Since transmit from skb destructor is forbidden, we use a tasklet
840 * to process all sockets that eventually need to send more skbs.
841 * We use one tasklet per cpu, with its own queue of sockets.
842 */
843struct tsq_tasklet {
844 struct tasklet_struct tasklet;
845 struct list_head head; /* queue of tcp sockets */
846};
847static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
848
849static void tcp_tsq_handler(struct sock *sk)
850{
851 if ((1 << sk->sk_state) &
852 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
853 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
854 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
855}
856/*
857 * One tasklet per cpu tries to send more skbs.
858 * We run in tasklet context but need to disable irqs when
859 * transferring tsq->head because tcp_wfree() might
860 * interrupt us (non NAPI drivers)
861 */
862static void tcp_tasklet_func(unsigned long data)
863{
864 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
865 LIST_HEAD(list);
866 unsigned long flags;
867 struct list_head *q, *n;
868 struct tcp_sock *tp;
869 struct sock *sk;
870
871 local_irq_save(flags);
872 list_splice_init(&tsq->head, &list);
873 local_irq_restore(flags);
874
875 list_for_each_safe(q, n, &list) {
876 tp = list_entry(q, struct tcp_sock, tsq_node);
877 list_del(&tp->tsq_node);
878
879 sk = (struct sock *)tp;
880 bh_lock_sock(sk);
881
882 if (!sock_owned_by_user(sk)) {
883 tcp_tsq_handler(sk);
884 } else {
885 /* defer the work to tcp_release_cb() */
886 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
887 }
888 bh_unlock_sock(sk);
889
890 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
891 sk_free(sk);
892 }
893}
894
895#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
896 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
897 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
898 (1UL << TCP_MTU_REDUCED_DEFERRED))
899/**
900 * tcp_release_cb - tcp release_sock() callback
901 * @sk: socket
902 *
903 * called from release_sock() to perform protocol dependent
904 * actions before socket release.
905 */
906void tcp_release_cb(struct sock *sk)
907{
908 struct tcp_sock *tp = tcp_sk(sk);
909 unsigned long flags, nflags;
910
911 /* perform an atomic operation only if at least one flag is set */
912 do {
913 flags = tp->tsq_flags;
914 if (!(flags & TCP_DEFERRED_ALL))
915 return;
916 nflags = flags & ~TCP_DEFERRED_ALL;
917 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
918
919 if (flags & (1UL << TCP_TSQ_DEFERRED))
920 tcp_tsq_handler(sk);
921
922 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
923 tcp_write_timer_handler(sk);
924 __sock_put(sk);
925 }
926 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
927 tcp_delack_timer_handler(sk);
928 __sock_put(sk);
929 }
930 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
931 sk->sk_prot->mtu_reduced(sk);
932 __sock_put(sk);
933 }
934}
935EXPORT_SYMBOL(tcp_release_cb);
936
937void __init tcp_tasklet_init(void)
938{
939 int i;
940
941 for_each_possible_cpu(i) {
942 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
943
944 INIT_LIST_HEAD(&tsq->head);
945 tasklet_init(&tsq->tasklet,
946 tcp_tasklet_func,
947 (unsigned long)tsq);
948 }
949}
950
951/*
952 * Write buffer destructor automatically called from kfree_skb.
953 * We can't xmit new skbs from this context, as we might already
954 * hold qdisc lock.
955 */
956static void tcp_wfree(struct sk_buff *skb)
957{
958 struct sock *sk = skb->sk;
959 struct tcp_sock *tp = tcp_sk(sk);
960
961 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
962 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
963 unsigned long flags;
964 struct tsq_tasklet *tsq;
965
966 /* Keep a ref on socket.
967 * This last ref will be released in tcp_tasklet_func()
968 */
969 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
970
971 /* queue this socket to tasklet queue */
972 local_irq_save(flags);
973 tsq = &__get_cpu_var(tsq_tasklet);
974 list_add(&tp->tsq_node, &tsq->head);
975 tasklet_schedule(&tsq->tasklet);
976 local_irq_restore(flags);
977 } else {
978 sock_wfree(skb);
979 }
980}
981
982/* This routine actually transmits TCP packets queued in by 783/* This routine actually transmits TCP packets queued in by
983 * tcp_do_sendmsg(). This is used by both the initial 784 * tcp_do_sendmsg(). This is used by both the initial
984 * transmission and possible later retransmissions. 785 * transmission and possible later retransmissions.
@@ -998,7 +799,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
998 struct tcp_sock *tp; 799 struct tcp_sock *tp;
999 struct tcp_skb_cb *tcb; 800 struct tcp_skb_cb *tcb;
1000 struct tcp_out_options opts; 801 struct tcp_out_options opts;
1001 unsigned int tcp_options_size, tcp_header_size; 802 unsigned tcp_options_size, tcp_header_size;
1002 struct tcp_md5sig_key *md5; 803 struct tcp_md5sig_key *md5;
1003 struct tcphdr *th; 804 struct tcphdr *th;
1004 int err; 805 int err;
@@ -1025,7 +826,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1025 tcb = TCP_SKB_CB(skb); 826 tcb = TCP_SKB_CB(skb);
1026 memset(&opts, 0, sizeof(opts)); 827 memset(&opts, 0, sizeof(opts));
1027 828
1028 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) 829 if (unlikely(tcb->flags & TCPHDR_SYN))
1029 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); 830 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1030 else 831 else
1031 tcp_options_size = tcp_established_options(sk, skb, &opts, 832 tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -1040,12 +841,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1040 841
1041 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
1042 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
1043 844 skb_set_owner_w(skb, sk);
1044 skb_orphan(skb);
1045 skb->sk = sk;
1046 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
1047 tcp_wfree : sock_wfree;
1048 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1049 845
1050 /* Build TCP header and checksum it. */ 846 /* Build TCP header and checksum it. */
1051 th = tcp_hdr(skb); 847 th = tcp_hdr(skb);
@@ -1054,9 +850,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1054 th->seq = htonl(tcb->seq); 850 th->seq = htonl(tcb->seq);
1055 th->ack_seq = htonl(tp->rcv_nxt); 851 th->ack_seq = htonl(tp->rcv_nxt);
1056 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 852 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1057 tcb->tcp_flags); 853 tcb->flags);
1058 854
1059 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { 855 if (unlikely(tcb->flags & TCPHDR_SYN)) {
1060 /* RFC1323: The window in SYN & SYN/ACK segments 856 /* RFC1323: The window in SYN & SYN/ACK segments
1061 * is never scaled. 857 * is never scaled.
1062 */ 858 */
@@ -1079,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1079 } 875 }
1080 876
1081 tcp_options_write((__be32 *)(th + 1), tp, &opts); 877 tcp_options_write((__be32 *)(th + 1), tp, &opts);
1082 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) 878 if (likely((tcb->flags & TCPHDR_SYN) == 0))
1083 TCP_ECN_send(sk, skb, tcp_header_size); 879 TCP_ECN_send(sk, skb, tcp_header_size);
1084 880
1085#ifdef CONFIG_TCP_MD5SIG 881#ifdef CONFIG_TCP_MD5SIG
@@ -1093,11 +889,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1093 889
1094 icsk->icsk_af_ops->send_check(sk, skb); 890 icsk->icsk_af_ops->send_check(sk, skb);
1095 891
1096 if (likely(tcb->tcp_flags & TCPHDR_ACK)) 892 if (likely(tcb->flags & TCPHDR_ACK))
1097 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 893 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
1098 894
1099 if (skb->len != tcp_header_size) 895 if (skb->len != tcp_header_size)
1100 tcp_event_data_sent(tp, sk); 896 tcp_event_data_sent(tp, skb, sk);
1101 897
1102 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) 898 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1103 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 899 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
@@ -1130,7 +926,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1130} 926}
1131 927
1132/* Initialize TSO segments for a packet. */ 928/* Initialize TSO segments for a packet. */
1133static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, 929static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
1134 unsigned int mss_now) 930 unsigned int mss_now)
1135{ 931{
1136 if (skb->len <= mss_now || !sk_can_gso(sk) || 932 if (skb->len <= mss_now || !sk_can_gso(sk) ||
@@ -1151,7 +947,7 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1151/* When a modification to fackets out becomes necessary, we need to check 947/* When a modification to fackets out becomes necessary, we need to check
1152 * skb is counted to fackets_out or not. 948 * skb is counted to fackets_out or not.
1153 */ 949 */
1154static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, 950static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
1155 int decr) 951 int decr)
1156{ 952{
1157 struct tcp_sock *tp = tcp_sk(sk); 953 struct tcp_sock *tp = tcp_sk(sk);
@@ -1166,7 +962,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
1166/* Pcount in the middle of the write queue got changed, we need to do various 962/* Pcount in the middle of the write queue got changed, we need to do various
1167 * tweaks to fix counters 963 * tweaks to fix counters
1168 */ 964 */
1169static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) 965static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
1170{ 966{
1171 struct tcp_sock *tp = tcp_sk(sk); 967 struct tcp_sock *tp = tcp_sk(sk);
1172 968
@@ -1236,9 +1032,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1236 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1032 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1237 1033
1238 /* PSH and FIN should only be set in the second packet. */ 1034 /* PSH and FIN should only be set in the second packet. */
1239 flags = TCP_SKB_CB(skb)->tcp_flags; 1035 flags = TCP_SKB_CB(skb)->flags;
1240 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1036 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1241 TCP_SKB_CB(buff)->tcp_flags = flags; 1037 TCP_SKB_CB(buff)->flags = flags;
1242 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; 1038 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1243 1039
1244 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { 1040 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -1295,27 +1091,17 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
 	int i, k, eat;
 
-	eat = min_t(int, len, skb_headlen(skb));
-	if (eat) {
-		__skb_pull(skb, eat);
-		skb->avail_size -= eat;
-		len -= eat;
-		if (!len)
-			return;
-	}
 	eat = len;
 	k = 0;
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
-
-		if (size <= eat) {
-			skb_frag_unref(skb, i);
-			eat -= size;
+		if (skb_shinfo(skb)->frags[i].size <= eat) {
+			put_page(skb_shinfo(skb)->frags[i].page);
+			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
 			if (eat) {
 				skb_shinfo(skb)->frags[k].page_offset += eat;
-				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+				skb_shinfo(skb)->frags[k].size -= eat;
 				eat = 0;
 			}
 			k++;
@@ -1334,7 +1120,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 		return -ENOMEM;
 
-	__pskb_trim_head(skb, len);
+	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
+	if (unlikely(len < skb_headlen(skb)))
+		__skb_pull(skb, len);
+	else
+		__pskb_trim_head(skb, len - skb_headlen(skb));
 
 	TCP_SKB_CB(skb)->seq += len;
 	skb->ip_summed = CHECKSUM_PARTIAL;
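tcp_trim_head()/__pskb_trim_head() above remove len already-acknowledged bytes from the front of a queued segment: the linear head is pulled first (or skipped in this tree to preserve alignment), then whole page fragments are consumed and the first surviving fragment has its offset advanced. Below is a hedged sketch of that fragment loop over a plain array, with illustrative types and no page refcounting or cloned-buffer handling.

/* Hedged sketch of trimming `eat` bytes off a fragment list, in the spirit
 * of __pskb_trim_head(); `struct frag` is an illustrative stand-in. */
#include <stdio.h>

struct frag { unsigned int off, size; };

static int trim_frags(struct frag *f, int nr, unsigned int eat)
{
	int i, k = 0;

	for (i = 0; i < nr; i++) {
		if (f[i].size <= eat) {		/* fragment fully consumed */
			eat -= f[i].size;	/* (kernel would drop its page ref) */
		} else {
			f[k] = f[i];		/* keep, shifted down */
			if (eat) {		/* partially consumed: advance into it */
				f[k].off  += eat;
				f[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	return k;				/* new fragment count */
}

int main(void)
{
	struct frag f[3] = { {0, 1000}, {0, 1000}, {0, 1000} };
	int nr = trim_frags(f, 3, 1500);

	/* expect: 2 frags left, first one at off=500 size=500 */
	printf("frags left: %d, first off=%u size=%u\n", nr, f[0].off, f[0].size);
	return 0;
}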
@@ -1354,8 +1144,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 /* Calculate MSS. Not accounting for SACKs here. */
 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int mss_now;
 
 	/* Calculate base mss without TCP options:
@@ -1363,14 +1153,6 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
-	}
-
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1391,8 +1173,8 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 /* Inverse of above */
 int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int mtu;
 
 	mtu = mss +
@@ -1400,13 +1182,6 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mtu += icsk->icsk_af_ops->net_frag_header_len;
-	}
 	return mtu;
 }
 
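tcp_mtu_to_mss() and tcp_mss_to_mtu() above are inverse calculations: the MSS is the path MTU minus the network header, the fixed TCP header and any recorded extension-header overhead, clamped to what the peer advertised; the MTU is recovered by adding those back. A hedged arithmetic sketch for plain IPv4 with 20-byte IP and TCP headers follows; the constants and the exact ordering are simplified relative to the kernel.

/* Hedged sketch of the MSS<->MTU arithmetic for IPv4 with no IP options
 * or extension headers; header sizes are the usual 20-byte minimums. */
#include <stdio.h>

#define IP4_HDR	20	/* net_header_len for IPv4 */
#define TCP_HDR	20	/* sizeof(struct tcphdr)   */

static int mtu_to_mss(int pmtu, int ext_hdr, int mss_clamp)
{
	int mss = pmtu - IP4_HDR - TCP_HDR;

	if (mss > mss_clamp)	/* peer clamp excludes TCP options */
		mss = mss_clamp;
	mss -= ext_hdr;		/* e.g. IPsec overhead recorded on the socket */
	if (mss < 48)		/* keep a sane floor, as the kernel does */
		mss = 48;
	return mss;
}

static int mss_to_mtu(int mss, int ext_hdr, int tcp_header_len)
{
	return mss + tcp_header_len + ext_hdr + IP4_HDR;
}

int main(void)
{
	int mss = mtu_to_mss(1500, 0, 65535);

	printf("MTU 1500 -> MSS %d\n", mss);				/* 1460 */
	printf("MSS %d -> MTU %d\n", mss, mss_to_mtu(mss, 0, TCP_HDR));	/* 1500 */
	return 0;
}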
@@ -1473,10 +1248,10 @@ EXPORT_SYMBOL(tcp_sync_mss);
1473 */ 1248 */
1474unsigned int tcp_current_mss(struct sock *sk) 1249unsigned int tcp_current_mss(struct sock *sk)
1475{ 1250{
1476 const struct tcp_sock *tp = tcp_sk(sk); 1251 struct tcp_sock *tp = tcp_sk(sk);
1477 const struct dst_entry *dst = __sk_dst_get(sk); 1252 struct dst_entry *dst = __sk_dst_get(sk);
1478 u32 mss_now; 1253 u32 mss_now;
1479 unsigned int header_len; 1254 unsigned header_len;
1480 struct tcp_out_options opts; 1255 struct tcp_out_options opts;
1481 struct tcp_md5sig_key *md5; 1256 struct tcp_md5sig_key *md5;
1482 1257
@@ -1534,22 +1309,22 @@ static void tcp_cwnd_validate(struct sock *sk)
  * modulo only when the receiver window alone is the limiting factor or
  * when we would be allowed to send the split-due-to-Nagle skb fully.
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int max_segs)
+static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
+					unsigned int mss_now, unsigned int cwnd)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, max_len;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 needed, window, cwnd_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
-	max_len = mss_now * max_segs;
+	cwnd_len = mss_now * cwnd;
 
-	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
-		return max_len;
+	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
+		return cwnd_len;
 
 	needed = min(skb->len, window);
 
-	if (max_len <= needed)
-		return max_len;
+	if (cwnd_len <= needed)
+		return cwnd_len;
 
 	return needed - needed % mss_now;
 }
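tcp_mss_split_point() above bounds how much of a large queued segment may go out at once: the congestion window converted to bytes (mss_now * cwnd in this tree, mss_now * max_segs upstream), further limited by the space left in the receive window, and rounded down to whole MSS units unless the receiver window alone is the limit. A hedged standalone sketch of that calculation, with sequence arithmetic reduced to plain lengths:

/* Hedged sketch of the split-point calculation; the "last skb in the
 * queue" detail is passed in as a flag. */
#include <stdio.h>

static unsigned int mss_split_point(unsigned int skb_len, unsigned int window,
				    unsigned int mss_now, unsigned int cwnd_segs,
				    int is_tail)
{
	unsigned int cwnd_len = mss_now * cwnd_segs;
	unsigned int needed;

	if (cwnd_len <= window && !is_tail)	/* cwnd is the binding limit */
		return cwnd_len;

	needed = skb_len < window ? skb_len : window;
	if (cwnd_len <= needed)
		return cwnd_len;

	return needed - needed % mss_now;	/* send whole MSS chunks only */
}

int main(void)
{
	/* 64 KB queued, 10000 bytes of receive window, 1460-byte MSS, cwnd of 10 */
	printf("%u\n", mss_split_point(65536, 10000, 1460, 10, 1));	/* 8760 */
	return 0;
}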
@@ -1557,14 +1332,13 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules? If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
-					 const struct sk_buff *skb)
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
+					 struct sk_buff *skb)
 {
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN. */
-	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
+	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
 		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
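tcp_cwnd_test() above answers how many more segments the congestion window allows: packets in flight are those sent but not yet cumulatively acknowledged, minus those already accounted as SACKed or lost, plus outstanding retransmissions; the quota is snd_cwnd minus that, with the final FIN exempted. A hedged sketch of the quota arithmetic, with illustrative field names:

/* Hedged sketch of the congestion-window quota, mirroring the shape of
 * tcp_packets_in_flight()/tcp_cwnd_test(). */
#include <stdio.h>

struct tp_counts {
	unsigned int packets_out;	/* segments sent, not yet fully acked */
	unsigned int sacked_out;	/* selectively acknowledged           */
	unsigned int lost_out;		/* marked lost                        */
	unsigned int retrans_out;	/* retransmitted and still outstanding*/
	unsigned int snd_cwnd;		/* congestion window, in segments     */
};

static unsigned int cwnd_quota(const struct tp_counts *tp)
{
	unsigned int in_flight = tp->packets_out
			       - (tp->sacked_out + tp->lost_out)
			       + tp->retrans_out;

	return tp->snd_cwnd > in_flight ? tp->snd_cwnd - in_flight : 0;
}

int main(void)
{
	struct tp_counts tp = { .packets_out = 8, .sacked_out = 2,
				.lost_out = 1, .retrans_out = 1, .snd_cwnd = 10 };

	printf("quota: %u segments\n", cwnd_quota(&tp));	/* 10 - 6 = 4 */
	return 0;
}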
@@ -1579,7 +1353,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1579 * This must be invoked the first time we consider transmitting 1353 * This must be invoked the first time we consider transmitting
1580 * SKB onto the wire. 1354 * SKB onto the wire.
1581 */ 1355 */
1582static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, 1356static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1583 unsigned int mss_now) 1357 unsigned int mss_now)
1584{ 1358{
1585 int tso_segs = tcp_skb_pcount(skb); 1359 int tso_segs = tcp_skb_pcount(skb);
@@ -1592,33 +1366,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
 }
 
 /* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
 {
 	return after(tp->snd_sml, tp->snd_una) &&
 		!after(tp->snd_sml, tp->snd_nxt);
 }
 
-/* Return false, if packet can be sent now without violation Nagle's rules:
+/* Return 0, if packet can be sent now without violation Nagle's rules:
  * 1. It is full sized.
  * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 3. Or TCP_NODELAY was set.
  * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
  *    With Minshall's modification: all sent small packets are ACKed.
  */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
 				   const struct sk_buff *skb,
-				   unsigned int mss_now, int nonagle)
+				   unsigned mss_now, int nonagle)
 {
 	return skb->len < mss_now &&
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
 
-/* Return true if the Nagle test allows this packet to be
+/* Return non-zero if the Nagle test allows this packet to be
  * sent now.
  */
-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 				  unsigned int cur_mss, int nonagle)
 {
 	/* Nagle rule does not apply to frames, which sit in the middle of the
 	 * write_queue (they have no chances to get new data).
@@ -1627,25 +1401,24 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	 * argument based upon the location of SKB in the send queue.
 	 */
 	if (nonagle & TCP_NAGLE_PUSH)
-		return true;
+		return 1;
 
 	/* Don't use the nagle rule for urgent data (or for the final FIN).
 	 * Nagle can be ignored during F-RTO too (see RFC4138).
 	 */
 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
-	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
-		return true;
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
+		return 1;
 
 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
-		return true;
+		return 1;
 
-	return false;
+	return 0;
 }
 
 /* Does at least the first segment of SKB fit into the send window? */
-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
-			     const struct sk_buff *skb,
-			     unsigned int cur_mss)
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
 				   unsigned int cur_mss)
 {
 	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
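The Nagle/Minshall code above holds back a sub-MSS segment while an earlier small segment is still unacknowledged, unless TCP_NODELAY or an explicit push applies, the frame carries FIN or urgent data, or it is full sized. The sketch below restates that decision with the same wrap-safe 32-bit sequence comparison as the kernel's after() macro; it is an illustration with invented field names, not the in-kernel predicates.

/* Hedged sketch of the Minshall-modified Nagle test. */
#include <stdio.h>
#include <stdint.h>

#define NAGLE_CORK	1
#define NAGLE_PUSH	2

static int after(uint32_t a, uint32_t b)	/* a is later than b, wrap-safe */
{
	return (int32_t)(a - b) > 0;
}

struct conn {
	uint32_t snd_una;	/* oldest unacknowledged byte        */
	uint32_t snd_nxt;	/* next byte to send                 */
	uint32_t snd_sml;	/* end of the last small packet sent */
	unsigned int packets_out;
};

/* non-zero while a previously sent sub-MSS packet is still unacknowledged */
static int minshall_check(const struct conn *c)
{
	return after(c->snd_sml, c->snd_una) && !after(c->snd_sml, c->snd_nxt);
}

/* non-zero: hold the segment back */
static int nagle_defers(const struct conn *c, unsigned int len,
			unsigned int mss, int nonagle, int has_fin)
{
	if (nonagle & NAGLE_PUSH)	/* TCP_NODELAY or an explicit push */
		return 0;
	if (has_fin || len >= mss)	/* final FIN and full segments go out */
		return 0;
	return (nonagle & NAGLE_CORK) ||
	       (c->packets_out && minshall_check(c));
}

int main(void)
{
	struct conn c = { .snd_una = 1000, .snd_nxt = 1400,
			  .snd_sml = 1400, .packets_out = 1 };

	printf("defer 100-byte write: %d\n", nagle_defers(&c, 100, 1460, 0, 0)); /* 1 */
	c.snd_una = 1400;		/* the small packet got acked */
	printf("defer after ack:      %d\n", nagle_defers(&c, 100, 1460, 0, 0)); /* 0 */
	return 0;
}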
1651 1424
@@ -1659,10 +1432,10 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1659 * should be put on the wire right now. If so, it returns the number of 1432 * should be put on the wire right now. If so, it returns the number of
1660 * packets allowed by the congestion window. 1433 * packets allowed by the congestion window.
1661 */ 1434 */
1662static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, 1435static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
1663 unsigned int cur_mss, int nonagle) 1436 unsigned int cur_mss, int nonagle)
1664{ 1437{
1665 const struct tcp_sock *tp = tcp_sk(sk); 1438 struct tcp_sock *tp = tcp_sk(sk);
1666 unsigned int cwnd_quota; 1439 unsigned int cwnd_quota;
1667 1440
1668 tcp_init_tso_segs(sk, skb, cur_mss); 1441 tcp_init_tso_segs(sk, skb, cur_mss);
@@ -1678,9 +1451,9 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1678} 1451}
1679 1452
1680/* Test if sending is allowed right now. */ 1453/* Test if sending is allowed right now. */
1681bool tcp_may_send_now(struct sock *sk) 1454int tcp_may_send_now(struct sock *sk)
1682{ 1455{
1683 const struct tcp_sock *tp = tcp_sk(sk); 1456 struct tcp_sock *tp = tcp_sk(sk);
1684 struct sk_buff *skb = tcp_send_head(sk); 1457 struct sk_buff *skb = tcp_send_head(sk);
1685 1458
1686 return skb && 1459 return skb &&
@@ -1722,9 +1495,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1722 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1495 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1723 1496
1724 /* PSH and FIN should only be set in the second packet. */ 1497 /* PSH and FIN should only be set in the second packet. */
1725 flags = TCP_SKB_CB(skb)->tcp_flags; 1498 flags = TCP_SKB_CB(skb)->flags;
1726 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1499 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1727 TCP_SKB_CB(buff)->tcp_flags = flags; 1500 TCP_SKB_CB(buff)->flags = flags;
1728 1501
1729 /* This packet was never sent out yet, so no SACK bits. */ 1502 /* This packet was never sent out yet, so no SACK bits. */
1730 TCP_SKB_CB(buff)->sacked = 0; 1503 TCP_SKB_CB(buff)->sacked = 0;
@@ -1748,14 +1521,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 	int win_divisor;
 
-	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
 
 	if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1778,8 +1551,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   sk->sk_gso_max_segs * tp->mss_cache))
+	if (limit >= sk->sk_gso_max_size)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1802,18 +1574,18 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 		 * frame, so if we have space for more than 3 frames
 		 * then send now.
 		 */
-		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
 			goto send_now;
 	}
 
 	/* Ok, it looks like it is advisable to defer. */
 	tp->tso_deferred = 1 | (jiffies << 1);
 
-	return true;
+	return 1;
 
 send_now:
 	tp->tso_deferred = 0;
-	return false;
+	return 0;
 }
 
 /* Create a new MTU probe if we are ready.
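tcp_tso_should_defer() above is John Heffner's heuristic: if neither the remaining send window nor the remaining congestion window allows a full-sized TSO burst, briefly defer in the hope of sending one larger frame later, unless we already hold a win_divisor share of the full window, or (with win_divisor disabled) roughly three spare frames. A hedged sketch of just the sizing decision; the parameters are pre-computed byte counts and the thresholds follow the shape of the heuristic, not its exact kernel form.

/* Hedged sketch of the TSO-defer sizing decision.
 * limit    = bytes sendable right now (min of remaining send/cong window)
 * full_wnd = min(full advertised window, full cwnd in bytes)             */
#include <stdio.h>

static int tso_should_defer(unsigned int limit, unsigned int full_wnd,
			    unsigned int skb_len, unsigned int mss,
			    unsigned int gso_max_size, int win_divisor)
{
	if (limit >= gso_max_size)	/* a full-sized TSO frame fits: send */
		return 0;
	if (skb_len <= limit)		/* everything queued fits already    */
		return 0;
	if (win_divisor) {
		/* send once we own at least 1/win_divisor of the full window */
		if (limit >= full_wnd / win_divisor)
			return 0;
	} else if (limit > 3 * mss) {	/* ~3 spare frames: just send        */
		return 0;
	}
	return 1;			/* defer: wait for data or ACKs      */
}

int main(void)
{
	/* 8 KB sendable now, 48 KB full window, 64 KB queued, divisor 3 */
	printf("defer: %d\n",
	       tso_should_defer(8192, 48000, 65536, 1460, 65536, 3));	/* 1 */
	return 0;
}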
@@ -1883,7 +1655,7 @@ static int tcp_mtu_probe(struct sock *sk)
1883 1655
1884 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1656 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1885 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1657 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1886 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; 1658 TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
1887 TCP_SKB_CB(nskb)->sacked = 0; 1659 TCP_SKB_CB(nskb)->sacked = 0;
1888 nskb->csum = 0; 1660 nskb->csum = 0;
1889 nskb->ip_summed = skb->ip_summed; 1661 nskb->ip_summed = skb->ip_summed;
@@ -1903,11 +1675,11 @@ static int tcp_mtu_probe(struct sock *sk)
1903 if (skb->len <= copy) { 1675 if (skb->len <= copy) {
1904 /* We've eaten all the data from this skb. 1676 /* We've eaten all the data from this skb.
1905 * Throw it away. */ 1677 * Throw it away. */
1906 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1678 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1907 tcp_unlink_write_queue(skb, sk); 1679 tcp_unlink_write_queue(skb, sk);
1908 sk_wmem_free_skb(sk, skb); 1680 sk_wmem_free_skb(sk, skb);
1909 } else { 1681 } else {
1910 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & 1682 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1911 ~(TCPHDR_FIN|TCPHDR_PSH); 1683 ~(TCPHDR_FIN|TCPHDR_PSH);
1912 if (!skb_shinfo(skb)->nr_frags) { 1684 if (!skb_shinfo(skb)->nr_frags) {
1913 skb_pull(skb, copy); 1685 skb_pull(skb, copy);
@@ -1955,11 +1727,11 @@ static int tcp_mtu_probe(struct sock *sk)
1955 * snd_up-64k-mss .. snd_up cannot be large. However, taking into 1727 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1956 * account rare use of URG, this is not a big flaw. 1728 * account rare use of URG, this is not a big flaw.
1957 * 1729 *
1958 * Returns true, if no segments are in flight and we have queued segments, 1730 * Returns 1, if no segments are in flight and we have queued segments, but
1959 * but cannot send anything now because of SWS or another problem. 1731 * cannot send anything now because of SWS or another problem.
1960 */ 1732 */
1961static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1733static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1962 int push_one, gfp_t gfp) 1734 int push_one, gfp_t gfp)
1963{ 1735{
1964 struct tcp_sock *tp = tcp_sk(sk); 1736 struct tcp_sock *tp = tcp_sk(sk);
1965 struct sk_buff *skb; 1737 struct sk_buff *skb;
@@ -1973,7 +1745,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1973 /* Do MTU probing. */ 1745 /* Do MTU probing. */
1974 result = tcp_mtu_probe(sk); 1746 result = tcp_mtu_probe(sk);
1975 if (!result) { 1747 if (!result) {
1976 return false; 1748 return 0;
1977 } else if (result > 0) { 1749 } else if (result > 0) {
1978 sent_pkts = 1; 1750 sent_pkts = 1;
1979 } 1751 }
@@ -1982,13 +1754,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1982 while ((skb = tcp_send_head(sk))) { 1754 while ((skb = tcp_send_head(sk))) {
1983 unsigned int limit; 1755 unsigned int limit;
1984 1756
1985
1986 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1757 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1987 BUG_ON(!tso_segs); 1758 BUG_ON(!tso_segs);
1988 1759
1989 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
1990 goto repair; /* Skip network transmission */
1991
1992 cwnd_quota = tcp_cwnd_test(tp, skb); 1760 cwnd_quota = tcp_cwnd_test(tp, skb);
1993 if (!cwnd_quota) 1761 if (!cwnd_quota)
1994 break; 1762 break;
@@ -2006,19 +1774,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2006 break; 1774 break;
2007 } 1775 }
2008 1776
2009 /* TSQ : sk_wmem_alloc accounts skb truesize,
2010 * including skb overhead. But thats OK.
2011 */
2012 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
2013 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
2014 break;
2015 }
2016 limit = mss_now; 1777 limit = mss_now;
2017 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1778 if (tso_segs > 1 && !tcp_urg_mode(tp))
2018 limit = tcp_mss_split_point(sk, skb, mss_now, 1779 limit = tcp_mss_split_point(sk, skb, mss_now,
2019 min_t(unsigned int, 1780 cwnd_quota);
2020 cwnd_quota,
2021 sk->sk_gso_max_segs));
2022 1781
2023 if (skb->len > limit && 1782 if (skb->len > limit &&
2024 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 1783 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2029,24 +1788,21 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2029 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) 1788 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2030 break; 1789 break;
2031 1790
2032repair:
2033 /* Advance the send_head. This one is sent out. 1791 /* Advance the send_head. This one is sent out.
2034 * This call will increment packets_out. 1792 * This call will increment packets_out.
2035 */ 1793 */
2036 tcp_event_new_data_sent(sk, skb); 1794 tcp_event_new_data_sent(sk, skb);
2037 1795
2038 tcp_minshall_update(tp, mss_now, skb); 1796 tcp_minshall_update(tp, mss_now, skb);
2039 sent_pkts += tcp_skb_pcount(skb); 1797 sent_pkts++;
2040 1798
2041 if (push_one) 1799 if (push_one)
2042 break; 1800 break;
2043 } 1801 }
2044 1802
2045 if (likely(sent_pkts)) { 1803 if (likely(sent_pkts)) {
2046 if (tcp_in_cwnd_reduction(sk))
2047 tp->prr_out += sent_pkts;
2048 tcp_cwnd_validate(sk); 1804 tcp_cwnd_validate(sk);
2049 return false; 1805 return 0;
2050 } 1806 }
2051 return !tp->packets_out && tcp_send_head(sk); 1807 return !tp->packets_out && tcp_send_head(sk);
2052} 1808}
@@ -2065,8 +1821,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2065 if (unlikely(sk->sk_state == TCP_CLOSE)) 1821 if (unlikely(sk->sk_state == TCP_CLOSE))
2066 return; 1822 return;
2067 1823
2068 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, 1824 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
2069 sk_gfp_atomic(sk, GFP_ATOMIC)))
2070 tcp_check_probe_timer(sk); 1825 tcp_check_probe_timer(sk);
2071} 1826}
2072 1827
@@ -2155,7 +1910,7 @@ u32 __tcp_select_window(struct sock *sk)
2155 if (free_space < (full_space >> 1)) { 1910 if (free_space < (full_space >> 1)) {
2156 icsk->icsk_ack.quick = 0; 1911 icsk->icsk_ack.quick = 0;
2157 1912
2158 if (sk_under_memory_pressure(sk)) 1913 if (tcp_memory_pressure)
2159 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 1914 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2160 4U * tp->advmss); 1915 4U * tp->advmss);
2161 1916
@@ -2228,7 +1983,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2228 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; 1983 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2229 1984
2230 /* Merge over control information. This moves PSH/FIN etc. over */ 1985 /* Merge over control information. This moves PSH/FIN etc. over */
2231 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; 1986 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
2232 1987
2233 /* All done, get rid of second SKB and account for it so 1988 /* All done, get rid of second SKB and account for it so
2234 * packet counting does not break. 1989 * packet counting does not break.
@@ -2246,22 +2001,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 }
 
 /* Check if coalescing SKBs is legal. */
-static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
 {
 	if (tcp_skb_pcount(skb) > 1)
-		return false;
+		return 0;
 	/* TODO: SACK collapsing could be used to remove this condition */
 	if (skb_shinfo(skb)->nr_frags != 0)
-		return false;
+		return 0;
 	if (skb_cloned(skb))
-		return false;
+		return 0;
 	if (skb == tcp_send_head(sk))
-		return false;
+		return 0;
 	/* Some heurestics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
-		return false;
+		return 0;
 
-	return true;
+	return 1;
 }
 
 /* Collapse packets in the retransmit queue to make to create
@@ -2272,11 +2027,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2272{ 2027{
2273 struct tcp_sock *tp = tcp_sk(sk); 2028 struct tcp_sock *tp = tcp_sk(sk);
2274 struct sk_buff *skb = to, *tmp; 2029 struct sk_buff *skb = to, *tmp;
2275 bool first = true; 2030 int first = 1;
2276 2031
2277 if (!sysctl_tcp_retrans_collapse) 2032 if (!sysctl_tcp_retrans_collapse)
2278 return; 2033 return;
2279 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2034 if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
2280 return; 2035 return;
2281 2036
2282 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2037 tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2286,7 +2041,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2286 space -= skb->len; 2041 space -= skb->len;
2287 2042
2288 if (first) { 2043 if (first) {
2289 first = false; 2044 first = 0;
2290 continue; 2045 continue;
2291 } 2046 }
2292 2047
@@ -2295,7 +2050,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2295 /* Punt if not enough space exists in the first SKB for 2050 /* Punt if not enough space exists in the first SKB for
2296 * the data in the second 2051 * the data in the second
2297 */ 2052 */
2298 if (skb->len > skb_availroom(to)) 2053 if (skb->len > skb_tailroom(to))
2299 break; 2054 break;
2300 2055
2301 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) 2056 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2309,11 +2064,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2309 * state updates are done by the caller. Returns non-zero if an 2064 * state updates are done by the caller. Returns non-zero if an
2310 * error occurred which prevented the send. 2065 * error occurred which prevented the send.
2311 */ 2066 */
2312int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 2067int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2313{ 2068{
2314 struct tcp_sock *tp = tcp_sk(sk); 2069 struct tcp_sock *tp = tcp_sk(sk);
2315 struct inet_connection_sock *icsk = inet_csk(sk); 2070 struct inet_connection_sock *icsk = inet_csk(sk);
2316 unsigned int cur_mss; 2071 unsigned int cur_mss;
2072 int err;
2317 2073
2318 /* Inconslusive MTU probe */ 2074 /* Inconslusive MTU probe */
2319 if (icsk->icsk_mtup.probe_size) { 2075 if (icsk->icsk_mtup.probe_size) {
@@ -2367,12 +2123,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2367 * since it is cheap to do so and saves bytes on the network. 2123 * since it is cheap to do so and saves bytes on the network.
2368 */ 2124 */
2369 if (skb->len > 0 && 2125 if (skb->len > 0 &&
2370 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && 2126 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
2371 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { 2127 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
2372 if (!pskb_trim(skb, 0)) { 2128 if (!pskb_trim(skb, 0)) {
2373 /* Reuse, even though it does some unnecessary work */ 2129 /* Reuse, even though it does some unnecessary work */
2374 tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, 2130 tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
2375 TCP_SKB_CB(skb)->tcp_flags); 2131 TCP_SKB_CB(skb)->flags);
2376 skb->ip_summed = CHECKSUM_NONE; 2132 skb->ip_summed = CHECKSUM_NONE;
2377 } 2133 }
2378 } 2134 }
@@ -2382,21 +2138,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-	/* make sure skb->data is aligned on arches that require it */
-	if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
-		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
-						   GFP_ATOMIC);
-		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			      -ENOBUFS;
-	} else {
-		return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
-	}
-}
-
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 
 	if (err == 0) {
 		/* Update global TCP statistics. */
@@ -2406,7 +2148,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 
 #if FASTRETRANS_DEBUG > 0
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
-		net_dbg_ratelimited("retrans_out leaked\n");
+		if (net_ratelimit())
+			printk(KERN_DEBUG "retrans_out leaked.\n");
 	}
 #endif
 	if (!tp->retrans_out)
@@ -2431,18 +2174,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2431/* Check if we forward retransmits are possible in the current 2174/* Check if we forward retransmits are possible in the current
2432 * window/congestion state. 2175 * window/congestion state.
2433 */ 2176 */
2434static bool tcp_can_forward_retransmit(struct sock *sk) 2177static int tcp_can_forward_retransmit(struct sock *sk)
2435{ 2178{
2436 const struct inet_connection_sock *icsk = inet_csk(sk); 2179 const struct inet_connection_sock *icsk = inet_csk(sk);
2437 const struct tcp_sock *tp = tcp_sk(sk); 2180 struct tcp_sock *tp = tcp_sk(sk);
2438 2181
2439 /* Forward retransmissions are possible only during Recovery. */ 2182 /* Forward retransmissions are possible only during Recovery. */
2440 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2183 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2441 return false; 2184 return 0;
2442 2185
2443 /* No forward retransmissions in Reno are possible. */ 2186 /* No forward retransmissions in Reno are possible. */
2444 if (tcp_is_reno(tp)) 2187 if (tcp_is_reno(tp))
2445 return false; 2188 return 0;
2446 2189
2447 /* Yeah, we have to make difficult choice between forward transmission 2190 /* Yeah, we have to make difficult choice between forward transmission
2448 * and retransmission... Both ways have their merits... 2191 * and retransmission... Both ways have their merits...
@@ -2453,9 +2196,9 @@ static bool tcp_can_forward_retransmit(struct sock *sk)
2453 */ 2196 */
2454 2197
2455 if (tcp_may_send_now(sk)) 2198 if (tcp_may_send_now(sk))
2456 return false; 2199 return 0;
2457 2200
2458 return true; 2201 return 1;
2459} 2202}
2460 2203
2461/* This gets called after a retransmit timeout, and the initially 2204/* This gets called after a retransmit timeout, and the initially
@@ -2545,15 +2288,10 @@ begin_fwd:
2545 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) 2288 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2546 continue; 2289 continue;
2547 2290
2548 if (tcp_retransmit_skb(sk, skb)) { 2291 if (tcp_retransmit_skb(sk, skb))
2549 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2550 return; 2292 return;
2551 }
2552 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2293 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2553 2294
2554 if (tcp_in_cwnd_reduction(sk))
2555 tp->prr_out += tcp_skb_pcount(skb);
2556
2557 if (skb == tcp_write_queue_head(sk)) 2295 if (skb == tcp_write_queue_head(sk))
2558 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2296 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2559 inet_csk(sk)->icsk_rto, 2297 inet_csk(sk)->icsk_rto,
@@ -2577,7 +2315,7 @@ void tcp_send_fin(struct sock *sk)
2577 mss_now = tcp_current_mss(sk); 2315 mss_now = tcp_current_mss(sk);
2578 2316
2579 if (tcp_send_head(sk) != NULL) { 2317 if (tcp_send_head(sk) != NULL) {
2580 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; 2318 TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
2581 TCP_SKB_CB(skb)->end_seq++; 2319 TCP_SKB_CB(skb)->end_seq++;
2582 tp->write_seq++; 2320 tp->write_seq++;
2583 } else { 2321 } else {
@@ -2639,11 +2377,11 @@ int tcp_send_synack(struct sock *sk)
2639 struct sk_buff *skb; 2377 struct sk_buff *skb;
2640 2378
2641 skb = tcp_write_queue_head(sk); 2379 skb = tcp_write_queue_head(sk);
2642 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 2380 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
2643 pr_debug("%s: wrong queue state\n", __func__); 2381 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2644 return -EFAULT; 2382 return -EFAULT;
2645 } 2383 }
2646 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 2384 if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
2647 if (skb_cloned(skb)) { 2385 if (skb_cloned(skb)) {
2648 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2386 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2649 if (nskb == NULL) 2387 if (nskb == NULL)
@@ -2657,27 +2395,17 @@ int tcp_send_synack(struct sock *sk)
2657 skb = nskb; 2395 skb = nskb;
2658 } 2396 }
2659 2397
2660 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2398 TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
2661 TCP_ECN_send_synack(tcp_sk(sk), skb); 2399 TCP_ECN_send_synack(tcp_sk(sk), skb);
2662 } 2400 }
2663 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2401 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2664 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2402 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2665} 2403}
2666 2404
2667/** 2405/* Prepare a SYN-ACK. */
2668 * tcp_make_synack - Prepare a SYN-ACK.
2669 * sk: listener socket
2670 * dst: dst entry attached to the SYNACK
2671 * req: request_sock pointer
2672 * rvp: request_values pointer
2673 *
2674 * Allocate one skb and build a SYNACK packet.
2675 * @dst is consumed : Caller should not use it again.
2676 */
2677struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2406struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2678 struct request_sock *req, 2407 struct request_sock *req,
2679 struct request_values *rvp, 2408 struct request_values *rvp)
2680 struct tcp_fastopen_cookie *foc)
2681{ 2409{
2682 struct tcp_out_options opts; 2410 struct tcp_out_options opts;
2683 struct tcp_extend_values *xvp = tcp_xv(rvp); 2411 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2693,16 +2421,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2693 2421
2694 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2422 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2695 s_data_desired = cvp->s_data_desired; 2423 s_data_desired = cvp->s_data_desired;
2696 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, 2424 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2697 sk_gfp_atomic(sk, GFP_ATOMIC)); 2425 if (skb == NULL)
2698 if (unlikely(!skb)) {
2699 dst_release(dst);
2700 return NULL; 2426 return NULL;
2701 } 2427
2702 /* Reserve space for headers. */ 2428 /* Reserve space for headers. */
2703 skb_reserve(skb, MAX_TCP_HEADER); 2429 skb_reserve(skb, MAX_TCP_HEADER);
2704 2430
2705 skb_dst_set(skb, dst); 2431 skb_dst_set(skb, dst_clone(dst));
2706 2432
2707 mss = dst_metric_advmss(dst); 2433 mss = dst_metric_advmss(dst);
2708 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2434 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2737,7 +2463,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2737#endif 2463#endif
2738 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2464 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2739 tcp_header_size = tcp_synack_options(sk, req, mss, 2465 tcp_header_size = tcp_synack_options(sk, req, mss,
2740 skb, &opts, &md5, xvp, foc) 2466 skb, &opts, &md5, xvp)
2741 + sizeof(*th); 2467 + sizeof(*th);
2742 2468
2743 skb_push(skb, tcp_header_size); 2469 skb_push(skb, tcp_header_size);
@@ -2791,8 +2517,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2791 } 2517 }
2792 2518
2793 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2519 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2794 /* XXX data is queued and acked as is. No buffer/window check */ 2520 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2795 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2796 2521
2797 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2522 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2798 th->window = htons(min(req->rcv_wnd, 65535U)); 2523 th->window = htons(min(req->rcv_wnd, 65535U));
@@ -2813,9 +2538,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2813EXPORT_SYMBOL(tcp_make_synack); 2538EXPORT_SYMBOL(tcp_make_synack);
2814 2539
2815/* Do all connect socket setups that can be done AF independent. */ 2540/* Do all connect socket setups that can be done AF independent. */
2816void tcp_connect_init(struct sock *sk) 2541static void tcp_connect_init(struct sock *sk)
2817{ 2542{
2818 const struct dst_entry *dst = __sk_dst_get(sk); 2543 struct dst_entry *dst = __sk_dst_get(sk);
2819 struct tcp_sock *tp = tcp_sk(sk); 2544 struct tcp_sock *tp = tcp_sk(sk);
2820 __u8 rcv_wscale; 2545 __u8 rcv_wscale;
2821 2546
@@ -2868,121 +2593,15 @@ void tcp_connect_init(struct sock *sk)
2868 tp->snd_una = tp->write_seq; 2593 tp->snd_una = tp->write_seq;
2869 tp->snd_sml = tp->write_seq; 2594 tp->snd_sml = tp->write_seq;
2870 tp->snd_up = tp->write_seq; 2595 tp->snd_up = tp->write_seq;
2871 tp->snd_nxt = tp->write_seq; 2596 tp->rcv_nxt = 0;
2872 2597 tp->rcv_wup = 0;
2873 if (likely(!tp->repair)) 2598 tp->copied_seq = 0;
2874 tp->rcv_nxt = 0;
2875 tp->rcv_wup = tp->rcv_nxt;
2876 tp->copied_seq = tp->rcv_nxt;
2877 2599
2878 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 2600 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2879 inet_csk(sk)->icsk_retransmits = 0; 2601 inet_csk(sk)->icsk_retransmits = 0;
2880 tcp_clear_retrans(tp); 2602 tcp_clear_retrans(tp);
2881} 2603}
2882 2604
2883static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2884{
2885 struct tcp_sock *tp = tcp_sk(sk);
2886 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2887
2888 tcb->end_seq += skb->len;
2889 skb_header_release(skb);
2890 __tcp_add_write_queue_tail(sk, skb);
2891 sk->sk_wmem_queued += skb->truesize;
2892 sk_mem_charge(sk, skb->truesize);
2893 tp->write_seq = tcb->end_seq;
2894 tp->packets_out += tcp_skb_pcount(skb);
2895}
2896
2897/* Build and send a SYN with data and (cached) Fast Open cookie. However,
2898 * queue a data-only packet after the regular SYN, such that regular SYNs
2899 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
2900 * only the SYN sequence, the data are retransmitted in the first ACK.
2901 * If cookie is not cached or other error occurs, falls back to send a
2902 * regular SYN with Fast Open cookie request option.
2903 */
2904static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2905{
2906 struct tcp_sock *tp = tcp_sk(sk);
2907 struct tcp_fastopen_request *fo = tp->fastopen_req;
2908 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
2909 struct sk_buff *syn_data = NULL, *data;
2910 unsigned long last_syn_loss = 0;
2911
2912 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
2913 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
2914 &syn_loss, &last_syn_loss);
2915 /* Recurring FO SYN losses: revert to regular handshake temporarily */
2916 if (syn_loss > 1 &&
2917 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
2918 fo->cookie.len = -1;
2919 goto fallback;
2920 }
2921
2922 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
2923 fo->cookie.len = -1;
2924 else if (fo->cookie.len <= 0)
2925 goto fallback;
2926
2927 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
2928 * user-MSS. Reserve maximum option space for middleboxes that add
2929 * private TCP options. The cost is reduced data space in SYN :(
2930 */
2931 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
2932 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2933 space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2934 MAX_TCP_OPTION_SPACE;
2935
2936 syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
2937 sk->sk_allocation);
2938 if (syn_data == NULL)
2939 goto fallback;
2940
2941 for (i = 0; i < iovlen && syn_data->len < space; ++i) {
2942 struct iovec *iov = &fo->data->msg_iov[i];
2943 unsigned char __user *from = iov->iov_base;
2944 int len = iov->iov_len;
2945
2946 if (syn_data->len + len > space)
2947 len = space - syn_data->len;
2948 else if (i + 1 == iovlen)
2949 /* No more data pending in inet_wait_for_connect() */
2950 fo->data = NULL;
2951
2952 if (skb_add_data(syn_data, from, len))
2953 goto fallback;
2954 }
2955
2956 /* Queue a data-only packet after the regular SYN for retransmission */
2957 data = pskb_copy(syn_data, sk->sk_allocation);
2958 if (data == NULL)
2959 goto fallback;
2960 TCP_SKB_CB(data)->seq++;
2961 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
2962 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
2963 tcp_connect_queue_skb(sk, data);
2964 fo->copied = data->len;
2965
2966 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2967 tp->syn_data = (fo->copied > 0);
2968 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
2969 goto done;
2970 }
2971 syn_data = NULL;
2972
2973fallback:
2974 /* Send a regular SYN with Fast Open cookie request option */
2975 if (fo->cookie.len > 0)
2976 fo->cookie.len = 0;
2977 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
2978 if (err)
2979 tp->syn_fastopen = 0;
2980 kfree_skb(syn_data);
2981done:
2982 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
2983 return err;
2984}
2985
2986/* Build a SYN and send it off. */ 2605/* Build a SYN and send it off. */
2987int tcp_connect(struct sock *sk) 2606int tcp_connect(struct sock *sk)
2988{ 2607{
@@ -2992,11 +2611,6 @@ int tcp_connect(struct sock *sk)
2992 2611
2993 tcp_connect_init(sk); 2612 tcp_connect_init(sk);
2994 2613
2995 if (unlikely(tp->repair)) {
2996 tcp_finish_connect(sk, NULL);
2997 return 0;
2998 }
2999
3000 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); 2614 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
3001 if (unlikely(buff == NULL)) 2615 if (unlikely(buff == NULL))
3002 return -ENOBUFS; 2616 return -ENOBUFS;
@@ -3004,14 +2618,19 @@ int tcp_connect(struct sock *sk)
3004 /* Reserve space for headers. */ 2618 /* Reserve space for headers. */
3005 skb_reserve(buff, MAX_TCP_HEADER); 2619 skb_reserve(buff, MAX_TCP_HEADER);
3006 2620
2621 tp->snd_nxt = tp->write_seq;
3007 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2622 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3008 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
3009 tcp_connect_queue_skb(sk, buff);
3010 TCP_ECN_send_syn(sk, buff); 2623 TCP_ECN_send_syn(sk, buff);
3011 2624
3012 /* Send off SYN; include data in Fast Open. */ 2625 /* Send it off. */
3013 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 2626 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3014 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2627 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2628 skb_header_release(buff);
2629 __tcp_add_write_queue_tail(sk, buff);
2630 sk->sk_wmem_queued += buff->truesize;
2631 sk_mem_charge(sk, buff->truesize);
2632 tp->packets_out += tcp_skb_pcount(buff);
2633 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3015 if (err == -ECONNREFUSED) 2634 if (err == -ECONNREFUSED)
3016 return err; 2635 return err;
3017 2636
@@ -3098,7 +2717,7 @@ void tcp_send_ack(struct sock *sk)
3098 * tcp_transmit_skb() will set the ownership to this 2717 * tcp_transmit_skb() will set the ownership to this
3099 * sock. 2718 * sock.
3100 */ 2719 */
3101 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 2720 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3102 if (buff == NULL) { 2721 if (buff == NULL) {
3103 inet_csk_schedule_ack(sk); 2722 inet_csk_schedule_ack(sk);
3104 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 2723 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3113,7 +2732,7 @@ void tcp_send_ack(struct sock *sk)
3113 2732
3114 /* Send it off, this clears delayed acks for us. */ 2733 /* Send it off, this clears delayed acks for us. */
3115 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2734 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3116 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 2735 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
3117} 2736}
3118 2737
3119/* This routine sends a packet with an out of date sequence 2738/* This routine sends a packet with an out of date sequence
@@ -3133,7 +2752,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3133 struct sk_buff *skb; 2752 struct sk_buff *skb;
3134 2753
3135 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 2754 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3136 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 2755 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3137 if (skb == NULL) 2756 if (skb == NULL)
3138 return -1; 2757 return -1;
3139 2758
@@ -3148,15 +2767,6 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3148 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2767 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3149} 2768}
3150 2769
3151void tcp_send_window_probe(struct sock *sk)
3152{
3153 if (sk->sk_state == TCP_ESTABLISHED) {
3154 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3155 tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
3156 tcp_xmit_probe_skb(sk, 0);
3157 }
3158}
3159
3160/* Initiate keepalive or window probe from timer. */ 2770/* Initiate keepalive or window probe from timer. */
3161int tcp_write_wakeup(struct sock *sk) 2771int tcp_write_wakeup(struct sock *sk)
3162{ 2772{
@@ -3182,13 +2792,13 @@ int tcp_write_wakeup(struct sock *sk)
3182 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 2792 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3183 skb->len > mss) { 2793 skb->len > mss) {
3184 seg_size = min(seg_size, mss); 2794 seg_size = min(seg_size, mss);
3185 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 2795 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
3186 if (tcp_fragment(sk, skb, seg_size, mss)) 2796 if (tcp_fragment(sk, skb, seg_size, mss))
3187 return -1; 2797 return -1;
3188 } else if (!tcp_skb_pcount(skb)) 2798 } else if (!tcp_skb_pcount(skb))
3189 tcp_set_skb_tso_segs(sk, skb, mss); 2799 tcp_set_skb_tso_segs(sk, skb, mss);
3190 2800
3191 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 2801 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
3192 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2802 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3193 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2803 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3194 if (!err) 2804 if (!err)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 4526fe68e60..85ee7eb7e38 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -18,8 +18,6 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/socket.h>
@@ -91,7 +89,7 @@ static inline int tcp_probe_avail(void)
  * Note: arguments must match tcp_rcv_established()!
  */
 static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			       struct tcphdr *th, unsigned int len)
+			       struct tcphdr *th, unsigned len)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
@@ -138,7 +136,7 @@ static struct jprobe tcp_jprobe = {
 	.entry	= jtcp_rcv_established,
 };
 
-static int tcpprobe_open(struct inode *inode, struct file *file)
+static int tcpprobe_open(struct inode * inode, struct file * file)
 {
 	/* Reset (empty) log */
 	spin_lock_bh(&tcp_probe.lock);
@@ -241,7 +239,7 @@ static __init int tcpprobe_init(void)
 	if (ret)
 		goto err1;
 
-	pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
+	pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
 	return 0;
  err1:
 	proc_net_remove(&init_net, procname);
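tcp_probe, patched above, logs one record per matching ACK into a fixed-size ring indexed by free-running head/tail counters, which the /proc reader then drains. Below is a hedged, generic sketch of such a single-producer ring with a power-of-two capacity; the structure and size are illustrative, not the module's own.

/* Hedged sketch of a fixed-size sample ring in the spirit of tcp_probe's
 * log; single producer, single consumer, power-of-two capacity. */
#include <stdio.h>

#define LOG_SIZE 8			/* must be a power of two */

struct sample { unsigned int snd_nxt, snd_una, cwnd; };

static struct sample log_buf[LOG_SIZE];
static unsigned int head, tail;		/* free-running counters */

static unsigned int log_used(void)  { return head - tail; }
static unsigned int log_avail(void) { return LOG_SIZE - log_used(); }

static int log_push(struct sample s)	/* called on each probed ACK */
{
	if (!log_avail())
		return -1;		/* full: drop the sample */
	log_buf[head++ & (LOG_SIZE - 1)] = s;
	return 0;
}

static int log_pop(struct sample *s)	/* called by the reader */
{
	if (!log_used())
		return -1;
	*s = log_buf[tail++ & (LOG_SIZE - 1)];
	return 0;
}

int main(void)
{
	struct sample s;
	unsigned int i;

	for (i = 0; i < 10; i++)	/* the last two do not fit and are dropped */
		log_push((struct sample){ .snd_nxt = i, .snd_una = i, .cwnd = 10 });
	while (!log_pop(&s))
		printf("snd_nxt=%u cwnd=%u\n", s.snd_nxt, s.cwnd);
	return 0;
}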
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac30c49..ecd44b0c45f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,6 +32,17 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
32int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly; 33int sysctl_tcp_thin_linear_timeouts __read_mostly;
34 34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39void tcp_init_xmit_timers(struct sock *sk)
40{
41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
42 &tcp_keepalive_timer);
43}
44EXPORT_SYMBOL(tcp_init_xmit_timers);
45
35static void tcp_write_err(struct sock *sk) 46static void tcp_write_err(struct sock *sk)
36{ 47{
37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; 48 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -66,7 +77,10 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
 	if (sk->sk_err_soft)
 		shift++;
 
-	if (tcp_check_oom(sk, shift)) {
+	if (tcp_too_many_orphans(sk, shift)) {
+		if (net_ratelimit())
+			printk(KERN_INFO "Out of socket memory\n");
+
 		/* Catch exceptional cases, when connection requires reset.
 		 *      1. Last segment was sent recently. */
 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -157,13 +171,13 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
-	bool do_reset, syn_set = false;
+	bool do_reset, syn_set = 0;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
-		syn_set = true;
+		syn_set = 1;
 	} else {
 		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
 			/* Black hole detection */
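tcp_write_timeout() and retransmits_timed_out() above decide when to give up on a connection: each retransmission doubles the RTO until it reaches TCP_RTO_MAX, and the retry boundaries (sysctl_tcp_retries1/2) are converted into an equivalent elapsed-time budget. A hedged sketch of that arithmetic with the usual defaults follows; the kernel's closed-form version switches to the flat cap slightly differently, so its retries2 budget comes out near 15 minutes rather than the roughly 13 minutes this loop gives.

/* Hedged sketch of the retransmission give-up bound: RTO doubles from a
 * base value until it hits TCP_RTO_MAX, after which each retry costs a
 * flat TCP_RTO_MAX.  Values are milliseconds; constants are the usual
 * Linux defaults, used purely for illustration. */
#include <stdio.h>

#define RTO_BASE	200U	/* TCP_RTO_MIN for established connections */
#define RTO_MAX		120000U	/* TCP_RTO_MAX: 120 s */

static unsigned int timeout_for_retries(unsigned int boundary)
{
	unsigned int timeout = 0, rto = RTO_BASE, i;

	for (i = 0; i < boundary; i++) {
		timeout += rto;		/* wait one RTO, then retransmit */
		rto = rto * 2 > RTO_MAX ? RTO_MAX : rto * 2;
	}
	return timeout;
}

int main(void)
{
	/* tcp_retries1 (default 3) triggers PMTU black-hole probing,
	 * tcp_retries2 (default 15) gives up on the connection. */
	printf("retries1=3  -> %u ms\n", timeout_for_retries(3));	/* 1400 ms */
	printf("retries2=15 -> %u ms\n", timeout_for_retries(15));	/* ~13.4 min */
	return 0;
}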
@@ -194,11 +208,21 @@ static int tcp_write_timeout(struct sock *sk)
194 return 0; 208 return 0;
195} 209}
196 210
197void tcp_delack_timer_handler(struct sock *sk) 211static void tcp_delack_timer(unsigned long data)
198{ 212{
213 struct sock *sk = (struct sock *)data;
199 struct tcp_sock *tp = tcp_sk(sk); 214 struct tcp_sock *tp = tcp_sk(sk);
200 struct inet_connection_sock *icsk = inet_csk(sk); 215 struct inet_connection_sock *icsk = inet_csk(sk);
201 216
217 bh_lock_sock(sk);
218 if (sock_owned_by_user(sk)) {
219 /* Try again later. */
220 icsk->icsk_ack.blocked = 1;
221 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
222 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
223 goto out_unlock;
224 }
225
202 sk_mem_reclaim_partial(sk); 226 sk_mem_reclaim_partial(sk);
203 227
204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 228 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -237,24 +261,9 @@ void tcp_delack_timer_handler(struct sock *sk)
237 } 261 }
238 262
239out: 263out:
240 if (sk_under_memory_pressure(sk)) 264 if (tcp_memory_pressure)
241 sk_mem_reclaim(sk); 265 sk_mem_reclaim(sk);
242} 266out_unlock:
243
244static void tcp_delack_timer(unsigned long data)
245{
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254 /* deleguate our work to tcp_release_cb() */
255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
257 }
258 bh_unlock_sock(sk); 267 bh_unlock_sock(sk);
259 sock_put(sk); 268 sock_put(sk);
260} 269}
@@ -305,35 +314,6 @@ static void tcp_probe_timer(struct sock *sk)
305} 314}
306 315
307/* 316/*
308 * Timer for Fast Open socket to retransmit SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->num_timeout >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325 /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
326 * returned from rtx_syn_ack() to make it more persistent like
327 * regular retransmit because if the child socket has been accepted
328 * it's not good to give up too easily.
329 */
330 inet_rtx_syn_ack(sk, req);
331 req->num_timeout++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
334}
335
336/*
337 * The TCP retransmit timer. 317 * The TCP retransmit timer.
338 */ 318 */
339 319
@@ -342,19 +322,6 @@ void tcp_retransmit_timer(struct sock *sk)
342 struct tcp_sock *tp = tcp_sk(sk); 322 struct tcp_sock *tp = tcp_sk(sk);
343 struct inet_connection_sock *icsk = inet_csk(sk); 323 struct inet_connection_sock *icsk = inet_csk(sk);
344 324
345 if (tp->early_retrans_delayed) {
346 tcp_resume_early_retransmit(sk);
347 return;
348 }
349 if (tp->fastopen_rsk) {
350 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
358 if (!tp->packets_out) 325 if (!tp->packets_out)
359 goto out; 326 goto out;
360 327
@@ -367,22 +334,22 @@ void tcp_retransmit_timer(struct sock *sk)
367 * connection. If the socket is an orphan, time it out, 334 * connection. If the socket is an orphan, time it out,
368 * we cannot allow such beasts to hang infinitely. 335 * we cannot allow such beasts to hang infinitely.
369 */ 336 */
337#ifdef TCP_DEBUG
370 struct inet_sock *inet = inet_sk(sk); 338 struct inet_sock *inet = inet_sk(sk);
371 if (sk->sk_family == AF_INET) { 339 if (sk->sk_family == AF_INET) {
372 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 340 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
373 &inet->inet_daddr, 341 &inet->inet_daddr, ntohs(inet->inet_dport),
374 ntohs(inet->inet_dport), inet->inet_num, 342 inet->inet_num, tp->snd_una, tp->snd_nxt);
375 tp->snd_una, tp->snd_nxt);
376 } 343 }
377#if IS_ENABLED(CONFIG_IPV6) 344#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
378 else if (sk->sk_family == AF_INET6) { 345 else if (sk->sk_family == AF_INET6) {
379 struct ipv6_pinfo *np = inet6_sk(sk); 346 struct ipv6_pinfo *np = inet6_sk(sk);
380 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 347 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
381 &np->daddr, 348 &np->daddr, ntohs(inet->inet_dport),
382 ntohs(inet->inet_dport), inet->inet_num, 349 inet->inet_num, tp->snd_una, tp->snd_nxt);
383 tp->snd_una, tp->snd_nxt);
384 } 350 }
385#endif 351#endif
352#endif
386 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { 353 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
387 tcp_write_err(sk); 354 tcp_write_err(sk);
388 goto out; 355 goto out;
@@ -481,11 +448,19 @@ out_reset_timer:
481out:; 448out:;
482} 449}
483 450
484void tcp_write_timer_handler(struct sock *sk) 451static void tcp_write_timer(unsigned long data)
485{ 452{
453 struct sock *sk = (struct sock *)data;
486 struct inet_connection_sock *icsk = inet_csk(sk); 454 struct inet_connection_sock *icsk = inet_csk(sk);
487 int event; 455 int event;
488 456
457 bh_lock_sock(sk);
458 if (sock_owned_by_user(sk)) {
459 /* Try again later */
460 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
461 goto out_unlock;
462 }
463
489 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 464 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
490 goto out; 465 goto out;
491 466
@@ -508,20 +483,7 @@ void tcp_write_timer_handler(struct sock *sk)
508 483
509out: 484out:
510 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
511} 486out_unlock:
512
513static void tcp_write_timer(unsigned long data)
514{
515 struct sock *sk = (struct sock *)data;
516
517 bh_lock_sock(sk);
518 if (!sock_owned_by_user(sk)) {
519 tcp_write_timer_handler(sk);
520 } else {
521 /* deleguate our work to tcp_release_cb() */
522 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
523 sock_hold(sk);
524 }
525 bh_unlock_sock(sk); 487 bh_unlock_sock(sk);
526 sock_put(sk); 488 sock_put(sk);
527} 489}
@@ -638,10 +600,3 @@ out:
638 bh_unlock_sock(sk); 600 bh_unlock_sock(sk);
639 sock_put(sk); 601 sock_put(sk);
640} 602}
641
642void tcp_init_xmit_timers(struct sock *sk)
643{
644 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
645 &tcp_keepalive_timer);
646}
647EXPORT_SYMBOL(tcp_init_xmit_timers);
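Both timer hunks above toggle the same locking pattern: the timer callback takes bh_lock_sock(), and if the socket is currently owned by a process (sock_owned_by_user()) the real work is postponed instead of running under contention — either by re-arming the timer shortly (the older code restored here) or by setting a deferred bit for tcp_release_cb() (the removed lines). Below is a minimal user-space model of that defer-or-run idea; the struct and helper names are invented for the illustration and are not kernel APIs.

/*
 * Minimal model of the defer-or-run timer pattern seen in
 * tcp_write_timer()/tcp_delack_timer().  Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
	bool owned_by_user;   /* models sock_owned_by_user() */
	bool work_deferred;   /* models the deferred-work flag/bit */
};

static void timer_handler(struct fake_sock *sk)
{
	(void)sk;
	printf("doing the real timer work now\n");
}

/* What the timer callback does in softirq context. */
static void timer_fired(struct fake_sock *sk)
{
	if (!sk->owned_by_user)
		timer_handler(sk);          /* socket not busy: run immediately */
	else
		sk->work_deferred = true;   /* busy: remember it for later */
}

/* What release_sock()/tcp_release_cb() would do when user space lets go. */
static void release_sock_cb(struct fake_sock *sk)
{
	sk->owned_by_user = false;
	if (sk->work_deferred) {
		sk->work_deferred = false;
		timer_handler(sk);
	}
}

int main(void)
{
	struct fake_sock sk = { .owned_by_user = true };

	timer_fired(&sk);      /* deferred: socket is owned by a process */
	release_sock_cb(&sk);  /* deferred work runs here instead */
	return 0;
}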
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 0d017183062..ac3b3ee4b07 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -105,7 +105,7 @@ drop:
105 return 0; 105 return 0;
106} 106}
107 107
108#if IS_ENABLED(CONFIG_IPV6) 108#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
109static int tunnel64_rcv(struct sk_buff *skb) 109static int tunnel64_rcv(struct sk_buff *skb)
110{ 110{
111 struct xfrm_tunnel *handler; 111 struct xfrm_tunnel *handler;
@@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
134 break; 134 break;
135} 135}
136 136
137#if IS_ENABLED(CONFIG_IPV6) 137#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
138static void tunnel64_err(struct sk_buff *skb, u32 info) 138static void tunnel64_err(struct sk_buff *skb, u32 info)
139{ 139{
140 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
@@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = {
152 .netns_ok = 1, 152 .netns_ok = 1,
153}; 153};
154 154
155#if IS_ENABLED(CONFIG_IPV6) 155#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
156static const struct net_protocol tunnel64_protocol = { 156static const struct net_protocol tunnel64_protocol = {
157 .handler = tunnel64_rcv, 157 .handler = tunnel64_rcv,
158 .err_handler = tunnel64_err, 158 .err_handler = tunnel64_err,
@@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = {
164static int __init tunnel4_init(void) 164static int __init tunnel4_init(void)
165{ 165{
166 if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { 166 if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
167 pr_err("%s: can't add protocol\n", __func__); 167 printk(KERN_ERR "tunnel4 init: can't add protocol\n");
168 return -EAGAIN; 168 return -EAGAIN;
169 } 169 }
170#if IS_ENABLED(CONFIG_IPV6) 170#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { 171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
172 pr_err("tunnel64 init: can't add protocol\n"); 172 printk(KERN_ERR "tunnel64 init: can't add protocol\n");
173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); 173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
174 return -EAGAIN; 174 return -EAGAIN;
175 } 175 }
@@ -179,12 +179,12 @@ static int __init tunnel4_init(void)
179 179
180static void __exit tunnel4_fini(void) 180static void __exit tunnel4_fini(void)
181{ 181{
182#if IS_ENABLED(CONFIG_IPV6) 182#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) 183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
184 pr_err("tunnel64 close: can't remove protocol\n"); 184 printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
185#endif 185#endif
186 if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) 186 if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
187 pr_err("tunnel4 close: can't remove protocol\n"); 187 printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
188} 188}
189 189
190module_init(tunnel4_init); 190module_init(tunnel4_init);
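The tunnel4.c hunks swap IS_ENABLED(CONFIG_IPV6) for the older explicit pair of defined() tests; IS_ENABLED(CONFIG_FOO) is true when the option is built in (=y) or modular (=m). The sketch below is a simplified, stand-alone re-implementation of that preprocessor trick — it mirrors the idea in include/linux/kconfig.h but is not the kernel header verbatim, and the CONFIG_FOO/CONFIG_BAR/CONFIG_BAZ symbols are made up for the demo.

/* Simplified IS_ENABLED() demo; compiles and runs on its own. */
#include <stdio.h>

#define CONFIG_FOO 1          /* pretend FOO=y */
#define CONFIG_BAR_MODULE 1   /* pretend BAR=m */
/* CONFIG_BAZ intentionally left undefined: BAZ=n */

#define __ARG_PLACEHOLDER_1 0,
#define __take_second(ignored, val, ...) val
#define __is_defined_as_1(junk_or_zero) __take_second(junk_or_zero 1, 0)
#define _is_defined_as_1(value) __is_defined_as_1(__ARG_PLACEHOLDER_##value)
#define is_defined_as_1(cfg) _is_defined_as_1(cfg)

#define IS_ENABLED(option) \
	(is_defined_as_1(option) || is_defined_as_1(option##_MODULE))

int main(void)
{
	printf("FOO enabled? %d\n", IS_ENABLED(CONFIG_FOO));  /* prints 1 */
	printf("BAR enabled? %d\n", IS_ENABLED(CONFIG_BAR));  /* prints 1 */
	printf("BAZ enabled? %d\n", IS_ENABLED(CONFIG_BAZ));  /* prints 0 */
	return 0;
}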
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 79c8dbe59b5..1b5a19340a9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -77,8 +77,7 @@
77 * 2 of the License, or (at your option) any later version. 77 * 2 of the License, or (at your option) any later version.
78 */ 78 */
79 79
80#define pr_fmt(fmt) "UDP: " fmt 80#include <asm/system.h>
81
82#include <asm/uaccess.h> 81#include <asm/uaccess.h>
83#include <asm/ioctls.h> 82#include <asm/ioctls.h>
84#include <linux/bootmem.h> 83#include <linux/bootmem.h>
@@ -107,8 +106,6 @@
107#include <net/checksum.h> 106#include <net/checksum.h>
108#include <net/xfrm.h> 107#include <net/xfrm.h>
109#include <trace/events/udp.h> 108#include <trace/events/udp.h>
110#include <linux/static_key.h>
111#include <trace/events/skb.h>
112#include "udp_impl.h" 109#include "udp_impl.h"
113 110
114struct udp_table udp_table __read_mostly; 111struct udp_table udp_table __read_mostly;
@@ -208,7 +205,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
208 205
209 if (!snum) { 206 if (!snum) {
210 int low, high, remaining; 207 int low, high, remaining;
211 unsigned int rand; 208 unsigned rand;
212 unsigned short first, last; 209 unsigned short first, last;
213 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); 210 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
214 211
@@ -448,7 +445,7 @@ exact_match:
448/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 445/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
449 * harder than this. -DaveM 446 * harder than this. -DaveM
450 */ 447 */
451struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, 448static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
452 __be16 sport, __be32 daddr, __be16 dport, 449 __be16 sport, __be32 daddr, __be16 dport,
453 int dif, struct udp_table *udptable) 450 int dif, struct udp_table *udptable)
454{ 451{
@@ -515,7 +512,6 @@ begin:
515 rcu_read_unlock(); 512 rcu_read_unlock();
516 return result; 513 return result;
517} 514}
518EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
519 515
520static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, 516static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
521 __be16 sport, __be16 dport, 517 __be16 sport, __be16 dport,
@@ -616,7 +612,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
616 break; 612 break;
617 case ICMP_DEST_UNREACH: 613 case ICMP_DEST_UNREACH:
618 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 614 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
619 ipv4_sk_update_pmtu(skb, sk, info);
620 if (inet->pmtudisc != IP_PMTUDISC_DONT) { 615 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
621 err = EMSGSIZE; 616 err = EMSGSIZE;
622 harderr = 1; 617 harderr = 1;
@@ -630,9 +625,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
630 err = icmp_err_convert[code].errno; 625 err = icmp_err_convert[code].errno;
631 } 626 }
632 break; 627 break;
633 case ICMP_REDIRECT:
634 ipv4_sk_redirect(skb, sk);
635 break;
636 } 628 }
637 629
638 /* 630 /*
@@ -758,7 +750,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
758 uh->check = CSUM_MANGLED_0; 750 uh->check = CSUM_MANGLED_0;
759 751
760send: 752send:
761 err = ip_send_skb(sock_net(sk), skb); 753 err = ip_send_skb(skb);
762 if (err) { 754 if (err) {
763 if (err == -ENOBUFS && !inet->recverr) { 755 if (err == -ENOBUFS && !inet->recverr) {
764 UDP_INC_STATS_USER(sock_net(sk), 756 UDP_INC_STATS_USER(sock_net(sk),
@@ -852,7 +844,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
852 * Get and verify the address. 844 * Get and verify the address.
853 */ 845 */
854 if (msg->msg_name) { 846 if (msg->msg_name) {
855 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; 847 struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
856 if (msg->msg_namelen < sizeof(*usin)) 848 if (msg->msg_namelen < sizeof(*usin))
857 return -EINVAL; 849 return -EINVAL;
858 if (usin->sin_family != AF_INET) { 850 if (usin->sin_family != AF_INET) {
@@ -924,8 +916,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
924 if (!saddr) 916 if (!saddr)
925 saddr = inet->mc_addr; 917 saddr = inet->mc_addr;
926 connected = 0; 918 connected = 0;
927 } else if (!ipc.oif) 919 }
928 ipc.oif = inet->uc_index;
929 920
930 if (connected) 921 if (connected)
931 rt = (struct rtable *)sk_dst_check(sk, 0); 922 rt = (struct rtable *)sk_dst_check(sk, 0);
@@ -982,7 +973,7 @@ back_from_confirm:
982 /* ... which is an evident application bug. --ANK */ 973 /* ... which is an evident application bug. --ANK */
983 release_sock(sk); 974 release_sock(sk);
984 975
985 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); 976 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
986 err = -EINVAL; 977 err = -EINVAL;
987 goto out; 978 goto out;
988 } 979 }
@@ -1061,7 +1052,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
1061 if (unlikely(!up->pending)) { 1052 if (unlikely(!up->pending)) {
1062 release_sock(sk); 1053 release_sock(sk);
1063 1054
1064 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); 1055 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
1065 return -EINVAL; 1056 return -EINVAL;
1066 } 1057 }
1067 1058
@@ -1173,8 +1164,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1173 struct inet_sock *inet = inet_sk(sk); 1164 struct inet_sock *inet = inet_sk(sk);
1174 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 1165 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
1175 struct sk_buff *skb; 1166 struct sk_buff *skb;
1176 unsigned int ulen, copied; 1167 unsigned int ulen;
1177 int peeked, off = 0; 1168 int peeked;
1178 int err; 1169 int err;
1179 int is_udplite = IS_UDPLITE(sk); 1170 int is_udplite = IS_UDPLITE(sk);
1180 bool slow; 1171 bool slow;
@@ -1190,15 +1181,14 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1190 1181
1191try_again: 1182try_again:
1192 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), 1183 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
1193 &peeked, &off, &err); 1184 &peeked, &err);
1194 if (!skb) 1185 if (!skb)
1195 goto out; 1186 goto out;
1196 1187
1197 ulen = skb->len - sizeof(struct udphdr); 1188 ulen = skb->len - sizeof(struct udphdr);
1198 copied = len; 1189 if (len > ulen)
1199 if (copied > ulen) 1190 len = ulen;
1200 copied = ulen; 1191 else if (len < ulen)
1201 else if (copied < ulen)
1202 msg->msg_flags |= MSG_TRUNC; 1192 msg->msg_flags |= MSG_TRUNC;
1203 1193
1204 /* 1194 /*
@@ -1207,14 +1197,14 @@ try_again:
1207 * coverage checksum (UDP-Lite), do it before the copy. 1197 * coverage checksum (UDP-Lite), do it before the copy.
1208 */ 1198 */
1209 1199
1210 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { 1200 if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
1211 if (udp_lib_checksum_complete(skb)) 1201 if (udp_lib_checksum_complete(skb))
1212 goto csum_copy_err; 1202 goto csum_copy_err;
1213 } 1203 }
1214 1204
1215 if (skb_csum_unnecessary(skb)) 1205 if (skb_csum_unnecessary(skb))
1216 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 1206 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
1217 msg->msg_iov, copied); 1207 msg->msg_iov, len);
1218 else { 1208 else {
1219 err = skb_copy_and_csum_datagram_iovec(skb, 1209 err = skb_copy_and_csum_datagram_iovec(skb,
1220 sizeof(struct udphdr), 1210 sizeof(struct udphdr),
@@ -1224,15 +1214,8 @@ try_again:
1224 goto csum_copy_err; 1214 goto csum_copy_err;
1225 } 1215 }
1226 1216
1227 if (unlikely(err)) { 1217 if (err)
1228 trace_kfree_skb(skb, udp_recvmsg);
1229 if (!peeked) {
1230 atomic_inc(&sk->sk_drops);
1231 UDP_INC_STATS_USER(sock_net(sk),
1232 UDP_MIB_INERRORS, is_udplite);
1233 }
1234 goto out_free; 1218 goto out_free;
1235 }
1236 1219
1237 if (!peeked) 1220 if (!peeked)
1238 UDP_INC_STATS_USER(sock_net(sk), 1221 UDP_INC_STATS_USER(sock_net(sk),
@@ -1250,7 +1233,7 @@ try_again:
1250 if (inet->cmsg_flags) 1233 if (inet->cmsg_flags)
1251 ip_cmsg_recv(msg, skb); 1234 ip_cmsg_recv(msg, skb);
1252 1235
1253 err = copied; 1236 err = len;
1254 if (flags & MSG_TRUNC) 1237 if (flags & MSG_TRUNC)
1255 err = ulen; 1238 err = ulen;
1256 1239
@@ -1284,7 +1267,7 @@ int udp_disconnect(struct sock *sk, int flags)
1284 sk->sk_state = TCP_CLOSE; 1267 sk->sk_state = TCP_CLOSE;
1285 inet->inet_daddr = 0; 1268 inet->inet_daddr = 0;
1286 inet->inet_dport = 0; 1269 inet->inet_dport = 0;
1287 sock_rps_reset_rxhash(sk); 1270 sock_rps_save_rxhash(sk, 0);
1288 sk->sk_bound_dev_if = 0; 1271 sk->sk_bound_dev_if = 0;
1289 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1272 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1290 inet_reset_saddr(sk); 1273 inet_reset_saddr(sk);
@@ -1372,9 +1355,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1372 int rc; 1355 int rc;
1373 1356
1374 if (inet_sk(sk)->inet_daddr) 1357 if (inet_sk(sk)->inet_daddr)
1375 sock_rps_save_rxhash(sk, skb); 1358 sock_rps_save_rxhash(sk, skb->rxhash);
1376 1359
1377 rc = sock_queue_rcv_skb(sk, skb); 1360 rc = ip_queue_rcv_skb(sk, skb);
1378 if (rc < 0) { 1361 if (rc < 0) {
1379 int is_udplite = IS_UDPLITE(sk); 1362 int is_udplite = IS_UDPLITE(sk);
1380 1363
@@ -1392,14 +1375,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1392 1375
1393} 1376}
1394 1377
1395static struct static_key udp_encap_needed __read_mostly;
1396void udp_encap_enable(void)
1397{
1398 if (!static_key_enabled(&udp_encap_needed))
1399 static_key_slow_inc(&udp_encap_needed);
1400}
1401EXPORT_SYMBOL(udp_encap_enable);
1402
1403/* returns: 1378/* returns:
1404 * -1: error 1379 * -1: error
1405 * 0: success 1380 * 0: success
@@ -1421,9 +1396,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1421 goto drop; 1396 goto drop;
1422 nf_reset(skb); 1397 nf_reset(skb);
1423 1398
1424 if (static_key_false(&udp_encap_needed) && up->encap_type) { 1399 if (up->encap_type) {
1425 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
1426
1427 /* 1400 /*
1428 * This is an encapsulation socket so pass the skb to 1401 * This is an encapsulation socket so pass the skb to
1429 * the socket's udp_encap_rcv() hook. Otherwise, just 1402 * the socket's udp_encap_rcv() hook. Otherwise, just
@@ -1436,11 +1409,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1436 */ 1409 */
1437 1410
1438 /* if we're overly short, let UDP handle it */ 1411 /* if we're overly short, let UDP handle it */
1439 encap_rcv = ACCESS_ONCE(up->encap_rcv); 1412 if (skb->len > sizeof(struct udphdr) &&
1440 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { 1413 up->encap_rcv != NULL) {
1441 int ret; 1414 int ret;
1442 1415
1443 ret = encap_rcv(sk, skb); 1416 ret = (*up->encap_rcv)(sk, skb);
1444 if (ret <= 0) { 1417 if (ret <= 0) {
1445 UDP_INC_STATS_BH(sock_net(sk), 1418 UDP_INC_STATS_BH(sock_net(sk),
1446 UDP_MIB_INDATAGRAMS, 1419 UDP_MIB_INDATAGRAMS,
@@ -1469,8 +1442,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1469 * provided by the application." 1442 * provided by the application."
1470 */ 1443 */
1471 if (up->pcrlen == 0) { /* full coverage was set */ 1444 if (up->pcrlen == 0) { /* full coverage was set */
1472 LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", 1445 LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
1473 UDP_SKB_CB(skb)->cscov, skb->len); 1446 "%d while full coverage %d requested\n",
1447 UDP_SKB_CB(skb)->cscov, skb->len);
1474 goto drop; 1448 goto drop;
1475 } 1449 }
1476 /* The next case involves violating the min. coverage requested 1450 /* The next case involves violating the min. coverage requested
@@ -1480,27 +1454,28 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1480 * Therefore the above ...()->partial_cov statement is essential. 1454 * Therefore the above ...()->partial_cov statement is essential.
1481 */ 1455 */
1482 if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { 1456 if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
1483 LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", 1457 LIMIT_NETDEBUG(KERN_WARNING
1484 UDP_SKB_CB(skb)->cscov, up->pcrlen); 1458 "UDPLITE: coverage %d too small, need min %d\n",
1459 UDP_SKB_CB(skb)->cscov, up->pcrlen);
1485 goto drop; 1460 goto drop;
1486 } 1461 }
1487 } 1462 }
1488 1463
1489 if (rcu_access_pointer(sk->sk_filter) && 1464 if (rcu_dereference_raw(sk->sk_filter)) {
1490 udp_lib_checksum_complete(skb)) 1465 if (udp_lib_checksum_complete(skb))
1491 goto drop; 1466 goto drop;
1467 }
1492 1468
1493 1469
1494 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) 1470 if (sk_rcvqueues_full(sk, skb))
1495 goto drop; 1471 goto drop;
1496 1472
1497 rc = 0; 1473 rc = 0;
1498 1474
1499 ipv4_pktinfo_prepare(skb);
1500 bh_lock_sock(sk); 1475 bh_lock_sock(sk);
1501 if (!sock_owned_by_user(sk)) 1476 if (!sock_owned_by_user(sk))
1502 rc = __udp_queue_rcv_skb(sk, skb); 1477 rc = __udp_queue_rcv_skb(sk, skb);
1503 else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { 1478 else if (sk_add_backlog(sk, skb)) {
1504 bh_unlock_sock(sk); 1479 bh_unlock_sock(sk);
1505 goto drop; 1480 goto drop;
1506 } 1481 }
@@ -1709,10 +1684,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1709 1684
1710short_packet: 1685short_packet:
1711 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", 1686 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
1712 proto == IPPROTO_UDPLITE ? "Lite" : "", 1687 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1713 &saddr, ntohs(uh->source), 1688 &saddr,
1714 ulen, skb->len, 1689 ntohs(uh->source),
1715 &daddr, ntohs(uh->dest)); 1690 ulen,
1691 skb->len,
1692 &daddr,
1693 ntohs(uh->dest));
1716 goto drop; 1694 goto drop;
1717 1695
1718csum_error: 1696csum_error:
@@ -1721,8 +1699,11 @@ csum_error:
1721 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1699 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1722 */ 1700 */
1723 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", 1701 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
1724 proto == IPPROTO_UDPLITE ? "Lite" : "", 1702 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1725 &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), 1703 &saddr,
1704 ntohs(uh->source),
1705 &daddr,
1706 ntohs(uh->dest),
1726 ulen); 1707 ulen);
1727drop: 1708drop:
1728 UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); 1709 UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
@@ -1781,7 +1762,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1781 /* FALLTHROUGH */ 1762 /* FALLTHROUGH */
1782 case UDP_ENCAP_L2TPINUDP: 1763 case UDP_ENCAP_L2TPINUDP:
1783 up->encap_type = val; 1764 up->encap_type = val;
1784 udp_encap_enable();
1785 break; 1765 break;
1786 default: 1766 default:
1787 err = -ENOPROTOOPT; 1767 err = -ENOPROTOOPT;
@@ -2058,7 +2038,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
2058 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 2038 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
2059} 2039}
2060 2040
2061int udp_seq_open(struct inode *inode, struct file *file) 2041static int udp_seq_open(struct inode *inode, struct file *file)
2062{ 2042{
2063 struct udp_seq_afinfo *afinfo = PDE(inode)->data; 2043 struct udp_seq_afinfo *afinfo = PDE(inode)->data;
2064 struct udp_iter_state *s; 2044 struct udp_iter_state *s;
@@ -2074,7 +2054,6 @@ int udp_seq_open(struct inode *inode, struct file *file)
2074 s->udp_table = afinfo->udp_table; 2054 s->udp_table = afinfo->udp_table;
2075 return err; 2055 return err;
2076} 2056}
2077EXPORT_SYMBOL(udp_seq_open);
2078 2057
2079/* ------------------------------------------------------------------------ */ 2058/* ------------------------------------------------------------------------ */
2080int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) 2059int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
@@ -2082,12 +2061,17 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
2082 struct proc_dir_entry *p; 2061 struct proc_dir_entry *p;
2083 int rc = 0; 2062 int rc = 0;
2084 2063
2064 afinfo->seq_fops.open = udp_seq_open;
2065 afinfo->seq_fops.read = seq_read;
2066 afinfo->seq_fops.llseek = seq_lseek;
2067 afinfo->seq_fops.release = seq_release_net;
2068
2085 afinfo->seq_ops.start = udp_seq_start; 2069 afinfo->seq_ops.start = udp_seq_start;
2086 afinfo->seq_ops.next = udp_seq_next; 2070 afinfo->seq_ops.next = udp_seq_next;
2087 afinfo->seq_ops.stop = udp_seq_stop; 2071 afinfo->seq_ops.stop = udp_seq_stop;
2088 2072
2089 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2073 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2090 afinfo->seq_fops, afinfo); 2074 &afinfo->seq_fops, afinfo);
2091 if (!p) 2075 if (!p)
2092 rc = -ENOMEM; 2076 rc = -ENOMEM;
2093 return rc; 2077 return rc;
@@ -2115,9 +2099,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2115 bucket, src, srcp, dest, destp, sp->sk_state, 2099 bucket, src, srcp, dest, destp, sp->sk_state,
2116 sk_wmem_alloc_get(sp), 2100 sk_wmem_alloc_get(sp),
2117 sk_rmem_alloc_get(sp), 2101 sk_rmem_alloc_get(sp),
2118 0, 0L, 0, 2102 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
2119 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
2120 0, sock_i_ino(sp),
2121 atomic_read(&sp->sk_refcnt), sp, 2103 atomic_read(&sp->sk_refcnt), sp,
2122 atomic_read(&sp->sk_drops), len); 2104 atomic_read(&sp->sk_drops), len);
2123} 2105}
@@ -2139,20 +2121,14 @@ int udp4_seq_show(struct seq_file *seq, void *v)
2139 return 0; 2121 return 0;
2140} 2122}
2141 2123
2142static const struct file_operations udp_afinfo_seq_fops = {
2143 .owner = THIS_MODULE,
2144 .open = udp_seq_open,
2145 .read = seq_read,
2146 .llseek = seq_lseek,
2147 .release = seq_release_net
2148};
2149
2150/* ------------------------------------------------------------------------ */ 2124/* ------------------------------------------------------------------------ */
2151static struct udp_seq_afinfo udp4_seq_afinfo = { 2125static struct udp_seq_afinfo udp4_seq_afinfo = {
2152 .name = "udp", 2126 .name = "udp",
2153 .family = AF_INET, 2127 .family = AF_INET,
2154 .udp_table = &udp_table, 2128 .udp_table = &udp_table,
2155 .seq_fops = &udp_afinfo_seq_fops, 2129 .seq_fops = {
2130 .owner = THIS_MODULE,
2131 },
2156 .seq_ops = { 2132 .seq_ops = {
2157 .show = udp4_seq_show, 2133 .show = udp4_seq_show,
2158 }, 2134 },
@@ -2187,15 +2163,9 @@ void udp4_proc_exit(void)
2187static __initdata unsigned long uhash_entries; 2163static __initdata unsigned long uhash_entries;
2188static int __init set_uhash_entries(char *str) 2164static int __init set_uhash_entries(char *str)
2189{ 2165{
2190 ssize_t ret;
2191
2192 if (!str) 2166 if (!str)
2193 return 0; 2167 return 0;
2194 2168 uhash_entries = simple_strtoul(str, &str, 0);
2195 ret = kstrtoul(str, 0, &uhash_entries);
2196 if (ret)
2197 return 0;
2198
2199 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) 2169 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2200 uhash_entries = UDP_HTABLE_SIZE_MIN; 2170 uhash_entries = UDP_HTABLE_SIZE_MIN;
2201 return 1; 2171 return 1;
@@ -2206,16 +2176,26 @@ void __init udp_table_init(struct udp_table *table, const char *name)
2206{ 2176{
2207 unsigned int i; 2177 unsigned int i;
2208 2178
2209 table->hash = alloc_large_system_hash(name, 2179 if (!CONFIG_BASE_SMALL)
2210 2 * sizeof(struct udp_hslot), 2180 table->hash = alloc_large_system_hash(name,
2211 uhash_entries, 2181 2 * sizeof(struct udp_hslot),
2212 21, /* one slot per 2 MB */ 2182 uhash_entries,
2213 0, 2183 21, /* one slot per 2 MB */
2214 &table->log, 2184 0,
2215 &table->mask, 2185 &table->log,
2216 UDP_HTABLE_SIZE_MIN, 2186 &table->mask,
2217 64 * 1024); 2187 64 * 1024);
2218 2188 /*
2189 * Make sure hash table has the minimum size
2190 */
2191 if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
2192 table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
2193 2 * sizeof(struct udp_hslot), GFP_KERNEL);
2194 if (!table->hash)
2195 panic(name);
2196 table->log = ilog2(UDP_HTABLE_SIZE_MIN);
2197 table->mask = UDP_HTABLE_SIZE_MIN - 1;
2198 }
2219 table->hash2 = table->hash + (table->mask + 1); 2199 table->hash2 = table->hash + (table->mask + 1);
2220 for (i = 0; i <= table->mask; i++) { 2200 for (i = 0; i <= table->mask; i++) {
2221 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); 2201 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
@@ -2263,8 +2243,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2263 return 0; 2243 return 0;
2264} 2244}
2265 2245
2266struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 2246struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2267 netdev_features_t features)
2268{ 2247{
2269 struct sk_buff *segs = ERR_PTR(-EINVAL); 2248 struct sk_buff *segs = ERR_PTR(-EINVAL);
2270 unsigned int mss; 2249 unsigned int mss;
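The udp_recvmsg() hunk above reverts from a separate "copied" variable back to clamping "len" directly, but both sides implement the same contract: copy at most one datagram's worth of data into the caller's buffer and report MSG_TRUNC when the buffer was too small. The stand-alone program below shows that contract from user space with a pair of local UDP sockets; it is only a demo of the observable behaviour, not kernel-internal code, and error handling is omitted for brevity.

/* Demonstrates MSG_TRUNC in msg_flags when the receive buffer is short. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int tx = socket(AF_INET, SOCK_DGRAM, 0);
	int rx = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };
	socklen_t alen = sizeof(addr);
	char payload[100] = "a fairly long datagram";
	char small[10];
	struct iovec iov = { .iov_base = small, .iov_len = sizeof(small) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	bind(rx, (struct sockaddr *)&addr, sizeof(addr));   /* pick a free port */
	getsockname(rx, (struct sockaddr *)&addr, &alen);
	sendto(tx, payload, sizeof(payload), 0,
	       (struct sockaddr *)&addr, alen);

	n = recvmsg(rx, &msg, 0);
	printf("copied %zd of 100 bytes, MSG_TRUNC %s set\n",
	       n, (msg.msg_flags & MSG_TRUNC) ? "is" : "is not");

	close(tx);
	close(rx);
	return 0;
}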
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
deleted file mode 100644
index 505b30ad918..00000000000
--- a/net/ipv4/udp_diag.c
+++ /dev/null
@@ -1,216 +0,0 @@
1/*
2 * udp_diag.c Module for monitoring UDP transport protocols sockets.
3 *
4 * Authors: Pavel Emelyanov, <xemul@parallels.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13#include <linux/module.h>
14#include <linux/inet_diag.h>
15#include <linux/udp.h>
16#include <net/udp.h>
17#include <net/udplite.h>
18#include <linux/sock_diag.h>
19
20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
21 struct netlink_callback *cb, struct inet_diag_req_v2 *req,
22 struct nlattr *bc)
23{
24 if (!inet_diag_bc_sk(bc, sk))
25 return 0;
26
27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk),
29 NETLINK_CB(cb->skb).portid,
30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
31}
32
33static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
34 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
35{
36 int err = -EINVAL;
37 struct sock *sk;
38 struct sk_buff *rep;
39 struct net *net = sock_net(in_skb->sk);
40
41 if (req->sdiag_family == AF_INET)
42 sk = __udp4_lib_lookup(net,
43 req->id.idiag_src[0], req->id.idiag_sport,
44 req->id.idiag_dst[0], req->id.idiag_dport,
45 req->id.idiag_if, tbl);
46#if IS_ENABLED(CONFIG_IPV6)
47 else if (req->sdiag_family == AF_INET6)
48 sk = __udp6_lib_lookup(net,
49 (struct in6_addr *)req->id.idiag_src,
50 req->id.idiag_sport,
51 (struct in6_addr *)req->id.idiag_dst,
52 req->id.idiag_dport,
53 req->id.idiag_if, tbl);
54#endif
55 else
56 goto out_nosk;
57
58 err = -ENOENT;
59 if (sk == NULL)
60 goto out_nosk;
61
62 err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
63 if (err)
64 goto out;
65
66 err = -ENOMEM;
67 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
68 sizeof(struct inet_diag_meminfo) +
69 64)), GFP_KERNEL);
70 if (!rep)
71 goto out;
72
73 err = inet_sk_diag_fill(sk, NULL, rep, req,
74 sk_user_ns(NETLINK_CB(in_skb).ssk),
75 NETLINK_CB(in_skb).portid,
76 nlh->nlmsg_seq, 0, nlh);
77 if (err < 0) {
78 WARN_ON(err == -EMSGSIZE);
79 kfree_skb(rep);
80 goto out;
81 }
82 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
83 MSG_DONTWAIT);
84 if (err > 0)
85 err = 0;
86out:
87 if (sk)
88 sock_put(sk);
89out_nosk:
90 return err;
91}
92
93static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
94 struct inet_diag_req_v2 *r, struct nlattr *bc)
95{
96 int num, s_num, slot, s_slot;
97 struct net *net = sock_net(skb->sk);
98
99 s_slot = cb->args[0];
100 num = s_num = cb->args[1];
101
102 for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) {
103 struct sock *sk;
104 struct hlist_nulls_node *node;
105 struct udp_hslot *hslot = &table->hash[slot];
106
107 if (hlist_nulls_empty(&hslot->head))
108 continue;
109
110 spin_lock_bh(&hslot->lock);
111 sk_nulls_for_each(sk, node, &hslot->head) {
112 struct inet_sock *inet = inet_sk(sk);
113
114 if (!net_eq(sock_net(sk), net))
115 continue;
116 if (num < s_num)
117 goto next;
118 if (!(r->idiag_states & (1 << sk->sk_state)))
119 goto next;
120 if (r->sdiag_family != AF_UNSPEC &&
121 sk->sk_family != r->sdiag_family)
122 goto next;
123 if (r->id.idiag_sport != inet->inet_sport &&
124 r->id.idiag_sport)
125 goto next;
126 if (r->id.idiag_dport != inet->inet_dport &&
127 r->id.idiag_dport)
128 goto next;
129
130 if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
131 spin_unlock_bh(&hslot->lock);
132 goto done;
133 }
134next:
135 num++;
136 }
137 spin_unlock_bh(&hslot->lock);
138 }
139done:
140 cb->args[0] = slot;
141 cb->args[1] = num;
142}
143
144static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
145 struct inet_diag_req_v2 *r, struct nlattr *bc)
146{
147 udp_dump(&udp_table, skb, cb, r, bc);
148}
149
150static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
151 struct inet_diag_req_v2 *req)
152{
153 return udp_dump_one(&udp_table, in_skb, nlh, req);
154}
155
156static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
157 void *info)
158{
159 r->idiag_rqueue = sk_rmem_alloc_get(sk);
160 r->idiag_wqueue = sk_wmem_alloc_get(sk);
161}
162
163static const struct inet_diag_handler udp_diag_handler = {
164 .dump = udp_diag_dump,
165 .dump_one = udp_diag_dump_one,
166 .idiag_get_info = udp_diag_get_info,
167 .idiag_type = IPPROTO_UDP,
168};
169
170static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
171 struct inet_diag_req_v2 *r, struct nlattr *bc)
172{
173 udp_dump(&udplite_table, skb, cb, r, bc);
174}
175
176static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
177 struct inet_diag_req_v2 *req)
178{
179 return udp_dump_one(&udplite_table, in_skb, nlh, req);
180}
181
182static const struct inet_diag_handler udplite_diag_handler = {
183 .dump = udplite_diag_dump,
184 .dump_one = udplite_diag_dump_one,
185 .idiag_get_info = udp_diag_get_info,
186 .idiag_type = IPPROTO_UDPLITE,
187};
188
189static int __init udp_diag_init(void)
190{
191 int err;
192
193 err = inet_diag_register(&udp_diag_handler);
194 if (err)
195 goto out;
196 err = inet_diag_register(&udplite_diag_handler);
197 if (err)
198 goto out_lite;
199out:
200 return err;
201out_lite:
202 inet_diag_unregister(&udp_diag_handler);
203 goto out;
204}
205
206static void __exit udp_diag_exit(void)
207{
208 inet_diag_unregister(&udplite_diag_handler);
209 inet_diag_unregister(&udp_diag_handler);
210}
211
212module_init(udp_diag_init);
213module_exit(udp_diag_exit);
214MODULE_LICENSE("GPL");
215MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
216MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
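The deleted udp_diag.c is the kernel side of the sock_diag netlink interface for UDP/UDP-Lite sockets. As a rough illustration of what it answers, here is a minimal user-space dump request for all AF_INET/IPPROTO_UDP sockets, roughly what ss does on kernels that carry the diag modules. It only works where udp_diag is present, and most error handling is omitted; the uAPI structures and constants used are the standard ones from linux/sock_diag.h and linux/inet_diag.h.

/* Minimal SOCK_DIAG_BY_FAMILY dump of IPv4 UDP sockets. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family   = AF_INET,
			.sdiag_protocol = IPPROTO_UDP,
			.idiag_states   = -1,   /* all socket states */
		},
	};
	char buf[16384];

	if (fd < 0)
		return 1;
	sendto(fd, &msg, sizeof(msg), 0,
	       (struct sockaddr *)&nladdr, sizeof(nladdr));

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		while (NLMSG_OK(h, len)) {
			struct inet_diag_msg *d;

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR)
				goto out;
			d = NLMSG_DATA(h);
			printf("udp socket: sport %u inode %u\n",
			       ntohs(d->id.idiag_sport), d->idiag_inode);
			h = NLMSG_NEXT(h, len);
		}
	}
out:
	close(fd);
	return 0;
}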
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 5a681e298b9..aaad650d47d 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
25 size_t len, int noblock, int flags, int *addr_len); 25 size_t len, int noblock, int flags, int *addr_len);
26extern int udp_sendpage(struct sock *sk, struct page *page, int offset, 26extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
27 size_t size, int flags); 27 size_t size, int flags);
28extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 28extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
29extern void udp_destroy_sock(struct sock *sk); 29extern void udp_destroy_sock(struct sock *sk);
30 30
31#ifdef CONFIG_PROC_FS 31#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc3..aee9963f7f5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -10,10 +10,6 @@
10 * as published by the Free Software Foundation; either version 10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version. 11 * 2 of the License, or (at your option) any later version.
12 */ 12 */
13
14#define pr_fmt(fmt) "UDPLite: " fmt
15
16#include <linux/export.h>
17#include "udp_impl.h" 13#include "udp_impl.h"
18 14
19struct udp_table udplite_table __read_mostly; 15struct udp_table udplite_table __read_mostly;
@@ -75,20 +71,13 @@ static struct inet_protosw udplite4_protosw = {
75}; 71};
76 72
77#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
78
79static const struct file_operations udplite_afinfo_seq_fops = {
80 .owner = THIS_MODULE,
81 .open = udp_seq_open,
82 .read = seq_read,
83 .llseek = seq_lseek,
84 .release = seq_release_net
85};
86
87static struct udp_seq_afinfo udplite4_seq_afinfo = { 74static struct udp_seq_afinfo udplite4_seq_afinfo = {
88 .name = "udplite", 75 .name = "udplite",
89 .family = AF_INET, 76 .family = AF_INET,
90 .udp_table = &udplite_table, 77 .udp_table = &udplite_table,
91 .seq_fops = &udplite_afinfo_seq_fops, 78 .seq_fops = {
79 .owner = THIS_MODULE,
80 },
92 .seq_ops = { 81 .seq_ops = {
93 .show = udp4_seq_show, 82 .show = udp4_seq_show,
94 }, 83 },
@@ -132,11 +121,11 @@ void __init udplite4_register(void)
132 inet_register_protosw(&udplite4_protosw); 121 inet_register_protosw(&udplite4_protosw);
133 122
134 if (udplite4_proc_init()) 123 if (udplite4_proc_init())
135 pr_err("%s: Cannot register /proc!\n", __func__); 124 printk(KERN_ERR "%s: Cannot register /proc!\n", __func__);
136 return; 125 return;
137 126
138out_unregister_proto: 127out_unregister_proto:
139 proto_unregister(&udplite_prot); 128 proto_unregister(&udplite_prot);
140out_register_err: 129out_register_err:
141 pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); 130 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
142} 131}
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e3db3f91511..63418185f52 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -110,7 +110,10 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
110 110
111 skb_push(skb, sizeof(*iph)); 111 skb_push(skb, sizeof(*iph));
112 skb_reset_network_header(skb); 112 skb_reset_network_header(skb);
113 skb_mac_header_rebuild(skb); 113
114 memmove(skb->data - skb->mac_len, skb_mac_header(skb),
115 skb->mac_len);
116 skb_set_mac_header(skb, -skb->mac_len);
114 117
115 xfrm4_beet_make_header(skb); 118 xfrm4_beet_make_header(skb);
116 119
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ddee0a099a2..534972e114a 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,65 +15,6 @@
15#include <net/ip.h> 15#include <net/ip.h>
16#include <net/xfrm.h> 16#include <net/xfrm.h>
17 17
18/* Informational hook. The decap is still done here. */
19static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
20static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
21
22int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
23{
24 struct xfrm_tunnel __rcu **pprev;
25 struct xfrm_tunnel *t;
26 int ret = -EEXIST;
27 int priority = handler->priority;
28
29 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
30
31 for (pprev = &rcv_notify_handlers;
32 (t = rcu_dereference_protected(*pprev,
33 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
34 pprev = &t->next) {
35 if (t->priority > priority)
36 break;
37 if (t->priority == priority)
38 goto err;
39
40 }
41
42 handler->next = *pprev;
43 rcu_assign_pointer(*pprev, handler);
44
45 ret = 0;
46
47err:
48 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
49 return ret;
50}
51EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
52
53int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
54{
55 struct xfrm_tunnel __rcu **pprev;
56 struct xfrm_tunnel *t;
57 int ret = -ENOENT;
58
59 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
60 for (pprev = &rcv_notify_handlers;
61 (t = rcu_dereference_protected(*pprev,
62 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
63 pprev = &t->next) {
64 if (t == handler) {
65 *pprev = handler->next;
66 ret = 0;
67 break;
68 }
69 }
70 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
71 synchronize_net();
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
76
77static inline void ipip_ecn_decapsulate(struct sk_buff *skb) 18static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
78{ 19{
79 struct iphdr *inner_iph = ipip_hdr(skb); 20 struct iphdr *inner_iph = ipip_hdr(skb);
@@ -123,14 +64,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
123 return 0; 64 return 0;
124} 65}
125 66
126#define for_each_input_rcu(head, handler) \
127 for (handler = rcu_dereference(head); \
128 handler != NULL; \
129 handler = rcu_dereference(handler->next))
130
131static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
132{ 68{
133 struct xfrm_tunnel *handler; 69 const unsigned char *old_mac;
134 int err = -EINVAL; 70 int err = -EINVAL;
135 71
136 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) 72 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -139,9 +75,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
139 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 75 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
140 goto out; 76 goto out;
141 77
142 for_each_input_rcu(rcv_notify_handlers, handler)
143 handler->handler(skb);
144
145 if (skb_cloned(skb) && 78 if (skb_cloned(skb) &&
146 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 79 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
147 goto out; 80 goto out;
@@ -151,9 +84,10 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
151 if (!(x->props.flags & XFRM_STATE_NOECN)) 84 if (!(x->props.flags & XFRM_STATE_NOECN))
152 ipip_ecn_decapsulate(skb); 85 ipip_ecn_decapsulate(skb);
153 86
87 old_mac = skb_mac_header(skb);
88 skb_set_mac_header(skb, -skb->mac_len);
89 memmove(skb_mac_header(skb), old_mac, skb->mac_len);
154 skb_reset_network_header(skb); 90 skb_reset_network_header(skb);
155 skb_mac_header_rebuild(skb);
156
157 err = 0; 91 err = 0;
158 92
159out: 93out:
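Both xfrm4 mode hunks above replace skb_mac_header_rebuild() with the older open-coded variant: memmove() the saved link-layer header so that it again sits immediately in front of the new (inner) network header after decapsulation. The stand-alone sketch below models that move on a flat byte buffer; the struct, offsets, and sizes are invented for the demo and are not sk_buff fields.

/* Model of re-placing a saved MAC header in front of the inner IP header. */
#include <stdio.h>
#include <string.h>

struct fake_skb {
	unsigned char buf[64];
	int mac_off;   /* where the MAC header currently starts   */
	int net_off;   /* where the (inner) network header starts */
	int mac_len;
};

/* Re-place the MAC header right before the network header. */
static void rebuild_mac_header(struct fake_skb *skb)
{
	int new_mac_off = skb->net_off - skb->mac_len;

	memmove(skb->buf + new_mac_off, skb->buf + skb->mac_off, skb->mac_len);
	skb->mac_off = new_mac_off;
}

int main(void)
{
	struct fake_skb skb = { .mac_off = 0, .net_off = 34, .mac_len = 14 };

	memcpy(skb.buf, "MACHDR-14bytes", 14);   /* stale, pre-decap position */
	memcpy(skb.buf + 34, "INNER-IP", 8);     /* inner header left after decap */

	rebuild_mac_header(&skb);
	printf("MAC header now at offset %d: %.*s\n",
	       skb.mac_off, skb.mac_len, skb.buf + skb.mac_off);
	return 0;
}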
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3be0ac2c192..a0b4c5da8d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,21 +79,30 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
79 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4; 80 const struct flowi4 *fl4 = &fl->u.ip4;
81 81
82 xdst->u.rt.rt_key_dst = fl4->daddr;
83 xdst->u.rt.rt_key_src = fl4->saddr;
84 xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
85 xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
82 xdst->u.rt.rt_iif = fl4->flowi4_iif; 86 xdst->u.rt.rt_iif = fl4->flowi4_iif;
87 xdst->u.rt.rt_oif = fl4->flowi4_oif;
88 xdst->u.rt.rt_mark = fl4->flowi4_mark;
83 89
84 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
85 dev_hold(dev); 91 dev_hold(dev);
86 92
93 xdst->u.rt.peer = rt->peer;
94 if (rt->peer)
95 atomic_inc(&rt->peer->refcnt);
96
87 /* Sheit... I remember I did this right. Apparently, 97 /* Sheit... I remember I did this right. Apparently,
88 * it was magically lost, so this code needs audit */ 98 * it was magically lost, so this code needs audit */
89 xdst->u.rt.rt_is_input = rt->rt_is_input;
90 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 99 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
91 RTCF_LOCAL); 100 RTCF_LOCAL);
92 xdst->u.rt.rt_type = rt->rt_type; 101 xdst->u.rt.rt_type = rt->rt_type;
102 xdst->u.rt.rt_src = rt->rt_src;
103 xdst->u.rt.rt_dst = rt->rt_dst;
93 xdst->u.rt.rt_gateway = rt->rt_gateway; 104 xdst->u.rt.rt_gateway = rt->rt_gateway;
94 xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; 105 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
95 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
96 INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
97 106
98 return 0; 107 return 0;
99} 108}
@@ -143,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
143 152
144 case IPPROTO_AH: 153 case IPPROTO_AH:
145 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
146 __be32 *ah_hdr = (__be32 *)xprth; 155 __be32 *ah_hdr = (__be32*)xprth;
147 156
148 fl4->fl4_ipsec_spi = ah_hdr[1]; 157 fl4->fl4_ipsec_spi = ah_hdr[1];
149 } 158 }
@@ -189,22 +198,12 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
189 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
190} 199}
191 200
192static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, 201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
193 struct sk_buff *skb, u32 mtu)
194{
195 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
196 struct dst_entry *path = xdst->route;
197
198 path->ops->update_pmtu(path, sk, skb, mtu);
199}
200
201static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
202 struct sk_buff *skb)
203{ 202{
204 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 203 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
205 struct dst_entry *path = xdst->route; 204 struct dst_entry *path = xdst->route;
206 205
207 path->ops->redirect(path, sk, skb); 206 path->ops->update_pmtu(path, mtu);
208} 207}
209 208
210static void xfrm4_dst_destroy(struct dst_entry *dst) 209static void xfrm4_dst_destroy(struct dst_entry *dst)
@@ -213,6 +212,9 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
213 212
214 dst_destroy_metrics_generic(dst); 213 dst_destroy_metrics_generic(dst);
215 214
215 if (likely(xdst->u.rt.peer))
216 inet_putpeer(xdst->u.rt.peer);
217
216 xfrm_dst_destroy(xdst); 218 xfrm_dst_destroy(xdst);
217} 219}
218 220
@@ -230,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
230 .protocol = cpu_to_be16(ETH_P_IP), 232 .protocol = cpu_to_be16(ETH_P_IP),
231 .gc = xfrm4_garbage_collect, 233 .gc = xfrm4_garbage_collect,
232 .update_pmtu = xfrm4_update_pmtu, 234 .update_pmtu = xfrm4_update_pmtu,
233 .redirect = xfrm4_redirect,
234 .cow_metrics = dst_cow_metrics_generic, 235 .cow_metrics = dst_cow_metrics_generic,
235 .destroy = xfrm4_dst_destroy, 236 .destroy = xfrm4_dst_destroy,
236 .ifdown = xfrm4_dst_ifdown, 237 .ifdown = xfrm4_dst_ifdown,
@@ -279,15 +280,26 @@ static void __exit xfrm4_policy_fini(void)
279 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); 280 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
280} 281}
281 282
282void __init xfrm4_init(void) 283void __init xfrm4_init(int rt_max_size)
283{ 284{
285 /*
286 * Select a default value for the gc_thresh based on the main route
287 * table hash size. It seems to me the worst case scenario is when
288 * we have ipsec operating in transport mode, in which we create a
289 * dst_entry per socket. The xfrm gc algorithm starts trying to remove
290 * entries at gc_thresh, and prevents new allocations as 2*gc_thresh
291 * so lets set an initial xfrm gc_thresh value at the rt_max_size/2.
292 * That will let us store an ipsec connection per route table entry,
293 * and start cleaning when were 1/2 full
294 */
295 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
284 dst_entries_init(&xfrm4_dst_ops); 296 dst_entries_init(&xfrm4_dst_ops);
285 297
286 xfrm4_state_init(); 298 xfrm4_state_init();
287 xfrm4_policy_init(); 299 xfrm4_policy_init();
288#ifdef CONFIG_SYSCTL 300#ifdef CONFIG_SYSCTL
289 sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", 301 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
290 xfrm4_policy_table); 302 xfrm4_policy_table);
291#endif 303#endif
292} 304}
293 305
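The comment restored in xfrm4_init() sizes the xfrm dst garbage collector from the routing cache limit: collection starts at rt_max_size/2, and since new allocations are refused at twice gc_thresh, the hard stop lands at rt_max_size itself. The snippet below just works that arithmetic through for an assumed rt_max_size; the value is illustrative, not taken from any real configuration.

/* Worked example of the gc_thresh sizing described in the comment above. */
#include <stdio.h>

int main(void)
{
	int rt_max_size = 524288;            /* assumed routing cache limit */
	int gc_thresh   = rt_max_size / 2;   /* start collecting here       */
	int hard_limit  = 2 * gc_thresh;     /* refuse new entries here     */

	printf("gc_thresh=%d, allocation stops at %d (== rt_max_size)\n",
	       gc_thresh, hard_limit);
	return 0;
}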
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e751bab..d9ac0a0058b 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -12,7 +12,6 @@
12#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
13#include <linux/ipsec.h> 13#include <linux/ipsec.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15#include <linux/export.h>
16 15
17static int xfrm4_init_flags(struct xfrm_state *x) 16static int xfrm4_init_flags(struct xfrm_state *x)
18{ 17{
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 05a5df2febc..82806455e85 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -3,8 +3,6 @@
3 * Copyright (C) 2003 David S. Miller (davem@redhat.com) 3 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
4 */ 4 */
5 5
6#define pr_fmt(fmt) "IPsec: " fmt
7
8#include <linux/skbuff.h> 6#include <linux/skbuff.h>
9#include <linux/module.h> 7#include <linux/module.h>
10#include <linux/mutex.h> 8#include <linux/mutex.h>
@@ -66,7 +64,7 @@ static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
66 .priority = 2, 64 .priority = 2,
67}; 65};
68 66
69#if IS_ENABLED(CONFIG_IPV6) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
70static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
71 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
72 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
@@ -77,18 +75,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
77static int __init ipip_init(void) 75static int __init ipip_init(void)
78{ 76{
79 if (xfrm_register_type(&ipip_type, AF_INET) < 0) { 77 if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
80 pr_info("%s: can't add xfrm type\n", __func__); 78 printk(KERN_INFO "ipip init: can't add xfrm type\n");
81 return -EAGAIN; 79 return -EAGAIN;
82 } 80 }
83 81
84 if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { 82 if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
85 pr_info("%s: can't add xfrm handler for AF_INET\n", __func__); 83 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n");
86 xfrm_unregister_type(&ipip_type, AF_INET); 84 xfrm_unregister_type(&ipip_type, AF_INET);
87 return -EAGAIN; 85 return -EAGAIN;
88 } 86 }
89#if IS_ENABLED(CONFIG_IPV6) 87#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
90 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { 88 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
91 pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__); 89 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
92 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); 90 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
93 xfrm_unregister_type(&ipip_type, AF_INET); 91 xfrm_unregister_type(&ipip_type, AF_INET);
94 return -EAGAIN; 92 return -EAGAIN;
@@ -99,16 +97,14 @@ static int __init ipip_init(void)
99 97
100static void __exit ipip_fini(void) 98static void __exit ipip_fini(void)
101{ 99{
102#if IS_ENABLED(CONFIG_IPV6) 100#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
103 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) 101 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
104 pr_info("%s: can't remove xfrm handler for AF_INET6\n", 102 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
105 __func__);
106#endif 103#endif
107 if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) 104 if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
108 pr_info("%s: can't remove xfrm handler for AF_INET\n", 105 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n");
109 __func__);
110 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) 106 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
111 pr_info("%s: can't remove xfrm type\n", __func__); 107 printk(KERN_INFO "ipip close: can't remove xfrm type\n");
112} 108}
113 109
114module_init(ipip_init); 110module_init(ipip_init);