Diffstat (limited to 'net/ipv4')
 net/ipv4/Makefile                |   2
 net/ipv4/af_inet.c               |   4
 net/ipv4/ah4.c                   |  78
 net/ipv4/esp4.c                  |  26
 net/ipv4/fib_frontend.c          |   2
 net/ipv4/ip_forward.c            |   7
 net/ipv4/ip_output.c             |  12
 net/ipv4/ip_sockglue.c           |  21
 net/ipv4/ip_tunnel.c             |  31
 net/ipv4/ip_tunnel_core.c        |   4
 net/ipv4/ip_vti.c                | 310
 net/ipv4/ipcomp.c                |  26
 net/ipv4/netfilter.c             |   2
 net/ipv4/netfilter/arp_tables.c  |   6
 net/ipv4/netfilter/ip_tables.c   |   6
 net/ipv4/ping.c                  |   2
 net/ipv4/proc.c                  |   6
 net/ipv4/raw.c                   |   2
 net/ipv4/route.c                 |  14
 net/ipv4/tcp.c                   |  13
 net/ipv4/tcp_cong.c              |  10
 net/ipv4/tcp_cubic.c             |   4
 net/ipv4/tcp_highspeed.c         |   1
 net/ipv4/tcp_hybla.c             |  13
 net/ipv4/tcp_illinois.c          |   2
 net/ipv4/tcp_input.c             | 187
 net/ipv4/tcp_ipv4.c              |   8
 net/ipv4/tcp_lp.c                |   2
 net/ipv4/tcp_memcontrol.c        |   4
 net/ipv4/tcp_metrics.c           |  83
 net/ipv4/tcp_minisocks.c         |   4
 net/ipv4/tcp_output.c            |  71
 net/ipv4/tcp_probe.c             |   2
 net/ipv4/tcp_scalable.c          |   1
 net/ipv4/tcp_timer.c             |   3
 net/ipv4/tcp_vegas.c             |   2
 net/ipv4/tcp_veno.c              |   1
 net/ipv4/tcp_westwood.c          |   1
 net/ipv4/tcp_yeah.c              |   2
 net/ipv4/udp.c                   |   3
 net/ipv4/xfrm4_input.c           |   9
 net/ipv4/xfrm4_mode_tunnel.c     |  68
 net/ipv4/xfrm4_policy.c          |   1
 net/ipv4/xfrm4_protocol.c        | 286
 44 files changed, 913 insertions(+), 429 deletions(-)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f8c49ce5b283..f032688d20d3 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -55,4 +55,4 @@ obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
-		      xfrm4_output.o
+		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 19ab78aca547..8c54870db792 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1505,9 +1505,9 @@ u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
 		bhptr = per_cpu_ptr(mib[0], cpu);
 		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
 		do {
-			start = u64_stats_fetch_begin_bh(syncp);
+			start = u64_stats_fetch_begin_irq(syncp);
 			v = *(((u64 *) bhptr) + offt);
-		} while (u64_stats_fetch_retry_bh(syncp, start));
+		} while (u64_stats_fetch_retry_irq(syncp, start));
 
 		res += v;
 	}
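The af_inet.c hunk above (and the matching one in ip_tunnel_core.c further down) switches the statistics fold from the BH-flavoured u64_stats helpers to the IRQ-flavoured ones, so 64-bit counters stay coherent even when writers can run in hard-IRQ context on 32-bit machines. The reader side is a seqcount-style retry loop; a minimal sketch of that pattern follows, with my_stats and my_read_packets as illustrative names rather than kernel API:

#include <linux/u64_stats_sync.h>

struct my_stats {
	u64 packets;
	struct u64_stats_sync syncp;	/* real seqcount on 32-bit, no-op on 64-bit */
};

static u64 my_read_packets(struct my_stats *s)
{
	unsigned int start;
	u64 v;

	do {
		start = u64_stats_fetch_begin_irq(&s->syncp);
		v = s->packets;		/* could tear on 32-bit without the retry loop */
	} while (u64_stats_fetch_retry_irq(&s->syncp, start));

	return v;
}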
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 717902669d2f..a2afa89513a0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -155,6 +155,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *iph, *top_iph;
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	ahp = x->data;
 	ahash = ahp->ahash;
@@ -167,14 +171,19 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah = ip_auth_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
 	err = -ENOMEM;
-	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
 	if (!iph)
 		goto out;
-
-	icv = ah_tmp_icv(ahash, iph, ihl);
+	seqhi = (__be32 *)((char *)iph + ihl);
+	icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memset(ah->auth_data, 0, ahp->icv_trunc_len);
 
@@ -210,10 +219,15 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->spi = x->id.spi;
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_output_done, skb);
 
 	AH_SKB_CB(skb)->tmp = iph;
@@ -295,6 +309,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_auth_hdr *ah;
 	struct ah_data *ahp;
 	int err = -ENOMEM;
+	int seqhi_len = 0;
+	__be32 *seqhi;
+	int sglists = 0;
+	struct scatterlist *seqhisg;
 
 	if (!pskb_may_pull(skb, sizeof(*ah)))
 		goto out;
@@ -335,14 +353,22 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	iph = ip_hdr(skb);
 	ihl = ip_hdrlen(skb);
 
-	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists = 1;
+		seqhi_len = sizeof(*seqhi);
+	}
+
+	work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+				ahp->icv_trunc_len + seqhi_len);
 	if (!work_iph)
 		goto out;
 
-	auth_data = ah_tmp_auth(work_iph, ihl);
+	seqhi = (__be32 *)((char *)work_iph + ihl);
+	auth_data = ah_tmp_auth(seqhi, seqhi_len);
 	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
 	req = ah_tmp_req(ahash, icv);
 	sg = ah_req_sg(ahash, req);
+	seqhisg = sg + nfrags;
 
 	memcpy(work_iph, iph, ihl);
 	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
@@ -361,10 +387,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_push(skb, ihl);
 
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	sg_init_table(sg, nfrags + sglists);
+	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
 
-	ahash_request_set_crypt(req, sg, icv, skb->len);
+	if (x->props.flags & XFRM_STATE_ESN) {
+		/* Attach seqhi sg right after packet payload */
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(seqhisg, seqhi, seqhi_len);
+	}
+	ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
 	ahash_request_set_callback(req, 0, ah_input_done, skb);
 
 	AH_SKB_CB(skb)->tmp = work_iph;
@@ -397,7 +428,7 @@ out:
 	return err;
 }
 
-static void ah4_err(struct sk_buff *skb, u32 info)
+static int ah4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -407,23 +438,25 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static int ah_init_state(struct xfrm_state *x)
@@ -505,6 +538,10 @@ static void ah_destroy(struct xfrm_state *x)
 	kfree(ahp);
 }
 
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
 
 static const struct xfrm_type ah_type =
 {
@@ -518,11 +555,12 @@ static const struct xfrm_type ah_type =
 	.output		= ah_output
 };
 
-static const struct net_protocol ah4_protocol = {
+static struct xfrm4_protocol ah4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ah4_rcv_cb,
 	.err_handler	=	ah4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ah4_init(void)
@@ -531,7 +569,7 @@ static int __init ah4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+	if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ah_type, AF_INET);
 		return -EAGAIN;
@@ -541,7 +579,7 @@ static int __init ah4_init(void)
 
 static void __exit ah4_fini(void)
 {
-	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+	if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
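The ah4.c hunks wire up extended sequence numbers (ESN): the upper 32 bits of the sequence counter never appear on the wire, yet RFC 4302 requires them to be covered by the ICV, so the patch grows the per-packet scratch buffer by seqhi_len and appends one extra scatterlist entry after the payload. A rough map of the layout, inferred from the pointer arithmetic above rather than from any documented ABI:

/* Sketch of the scratch buffer ah_alloc_tmp() hands back after this change:
 *
 *   +------------------+ <- iph / work_iph
 *   | IP header copy   |   ihl bytes
 *   +------------------+ <- seqhi = (__be32 *)((char *)iph + ihl)
 *   | seq.hi (ESN)     |   seqhi_len bytes (0 when ESN is off)
 *   +------------------+ <- icv / auth_data
 *   | truncated ICV    |
 *   +------------------+ <- req, then sg[nfrags + sglists]
 *
 * With ESN enabled the trailing scatterlist entry (seqhisg = sg + nfrags)
 * appends the high-order sequence bits after the skb payload, and the
 * digest length passed to ahash_request_set_crypt() grows by seqhi_len.
 * skb_to_sgvec_nomark() is used so the extra entry can be chained on
 * without the end-of-table mark getting in the way.
 */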
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 7785b28061ac..360b565918c4 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -473,7 +473,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
 		 net_adj) & ~(blksize - 1)) + net_adj - 2;
 }
 
-static void esp4_err(struct sk_buff *skb, u32 info)
+static int esp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -483,23 +483,25 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 static void esp_destroy(struct xfrm_state *x)
@@ -672,6 +674,11 @@ error:
 	return err;
 }
 
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type esp_type =
 {
 	.description	= "ESP4",
@@ -685,11 +692,12 @@ static const struct xfrm_type esp_type =
 	.output		= esp_output
 };
 
-static const struct net_protocol esp4_protocol = {
+static struct xfrm4_protocol esp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	esp4_rcv_cb,
 	.err_handler	=	esp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init esp4_init(void)
@@ -698,7 +706,7 @@ static int __init esp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&esp_type, AF_INET);
 		return -EAGAIN;
@@ -708,7 +716,7 @@ static int __init esp4_init(void)
 
 static void __exit esp4_fini(void)
 {
-	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
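ah4, esp4 and ipcomp4 all convert from struct net_protocol (registered via inet_add_protocol(), one owner per IP protocol number) to the new struct xfrm4_protocol, whose dispatcher lives in the xfrm4_protocol.c file this series adds. That indirection lets several users share IPPROTO_ESP/AH/COMP — the base transforms at priority 0 here, ip_vti.c's handlers at priority 100 below — which is also why err_handler now returns int: judging by the converted handlers, a handler returns 0 once it has consumed the event. A minimal registration sketch following the pattern above (field semantics inferred from this patch, not from documentation):

static int my_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;			/* nothing extra to do after xfrm_input() */
}

static int my_err(struct sk_buff *skb, u32 info)
{
	return 0;			/* 0: consumed; other users need not run */
}

static struct xfrm4_protocol my_esp_proto = {
	.handler	=	xfrm4_rcv,	/* full receive path */
	.input_handler	=	xfrm_input,	/* entry with parsed spi/seq */
	.cb_handler	=	my_rcv_cb,
	.err_handler	=	my_err,
	.priority	=	0,		/* base transform; vti registers at 100 */
};

/* registered with: xfrm4_protocol_register(&my_esp_proto, IPPROTO_ESP); */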
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c7539e22868b..1a629f870274 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -659,7 +659,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
-		return ip_rt_dump(skb, cb);
+		return skb->len;
 
 	s_h = cb->args[0];
 	s_e = cb->args[1];
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index f3869c186d97..be8abe73bb9f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -127,6 +127,10 @@ int ip_forward(struct sk_buff *skb)
 	struct rtable *rt;	/* Route we use */
 	struct ip_options *opt	= &(IPCB(skb)->opt);
 
+	/* that should never happen */
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
@@ -136,9 +140,6 @@ int ip_forward(struct sk_buff *skb)
 	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
 		return NET_RX_SUCCESS;
 
-	if (skb->pkt_type != PACKET_HOST)
-		goto drop;
-
 	skb_forward_csum(skb);
 
 	/*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 73c6b63bba74..1a0755fea491 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -446,7 +446,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
-	bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
 
 	dev = rt->dst.dev;
 
@@ -456,7 +455,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding);
+	mtu = ip_skb_dst_mtu(skb);
 	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
@@ -822,8 +821,7 @@ static int __ip_append_data(struct sock *sk,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + length > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1146,8 +1144,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-	maxnonfragsize = (inet->pmtudisc >= IP_PMTUDISC_DO) ?
-			 mtu : 0xFFFF;
+	maxnonfragsize = ip_sk_local_df(sk) ? 0xFFFF : mtu;
 
 	if (cork->length + size > maxnonfragsize - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -1308,8 +1305,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	 * to fragment the frame generated here. No matter, what transforms
 	 * how transforms change size of the packet, it will come out.
 	 */
-	if (inet->pmtudisc < IP_PMTUDISC_DO)
-		skb->local_df = 1;
+	skb->local_df = ip_sk_local_df(sk);
 
 	/* DF bit is set when we want to see DF on outgoing frames.
 	 * If local_df is set too, we still allow to fragment this frame
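The three open-coded pmtudisc comparisons in ip_output.c collapse into a single ip_sk_local_df() predicate, which also has to admit the new IP_PMTUDISC_OMIT mode accepted in ip_sockglue.c below. A plausible reconstruction of the helper, pieced together from the code it replaces (the real definition lives in include/net/ip.h; treat this as a sketch):

static inline bool ip_sk_local_df(const struct sock *sk)
{
	/* fragmentation locally allowed: PMTU discovery off or OMIT'd */
	return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
	       inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
}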
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 580dd96666e0..64741b938632 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -186,7 +186,8 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ip_cmsg_recv);
 
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+		 bool allow_ipv6)
 {
 	int err, val;
 	struct cmsghdr *cmsg;
@@ -194,6 +195,22 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
+#if defined(CONFIG_IPV6)
+		if (allow_ipv6 &&
+		    cmsg->cmsg_level == SOL_IPV6 &&
+		    cmsg->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *src_info;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+				return -EINVAL;
+			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+				return -EINVAL;
+			ipc->oif = src_info->ipi6_ifindex;
+			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+			continue;
+		}
+#endif
 		if (cmsg->cmsg_level != SOL_IP)
 			continue;
 		switch (cmsg->cmsg_type) {
@@ -626,7 +643,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->nodefrag = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_INTERFACE)
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
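The new ip_cmsg_send() branch lets callers that opt in with allow_ipv6 (ping and raw pass false above; per the diffstat udp.c is also touched by this series) accept IPV6_PKTINFO ancillary data on a dual-stack socket, provided the address is IPv4-mapped: the ifindex and the low 32 bits of the mapped address are folded into the IPv4 ipc cookie. A hedged userspace sketch of the ancillary data such a sendmsg() would carry (address and ifindex are made up for illustration):

#define _GNU_SOURCE		/* struct in6_pktinfo on glibc */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static void fill_v4mapped_pktinfo(struct msghdr *msg)
{
	static union {		/* static so it outlives the call; sized/aligned for one cmsg */
		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
		struct cmsghdr align;
	} u;
	struct in6_pktinfo info;
	struct cmsghdr *cm;

	memset(&info, 0, sizeof(info));
	info.ipi6_ifindex = 2;				/* hypothetical ifindex */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &info.ipi6_addr);

	msg->msg_control = u.buf;
	msg->msg_controllen = sizeof(u.buf);
	cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = SOL_IPV6;
	cm->cmsg_type = IPV6_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(info));
	memcpy(CMSG_DATA(cm), &info, sizeof(info));
}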
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a82a22d8f77f..e77381d1df9a 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -235,13 +235,17 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 {
 	unsigned int h;
 	__be32 remote;
+	__be32 i_key = parms->i_key;
 
 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 		remote = parms->iph.daddr;
 	else
 		remote = 0;
 
-	h = ip_tunnel_hash(parms->i_key, remote);
+	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+		i_key = 0;
+
+	h = ip_tunnel_hash(i_key, remote);
 	return &itn->tunnels[h];
 }
 
@@ -398,7 +402,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	fbt = netdev_priv(itn->fb_tunnel_dev);
 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
-		return NULL;
+		return ERR_CAST(dev);
 
 	dev->mtu = ip_tunnel_bind_dev(dev);
 
@@ -748,9 +752,13 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 
 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 
-		if (!t && (cmd == SIOCADDTUNNEL))
+		if (!t && (cmd == SIOCADDTUNNEL)) {
 			t = ip_tunnel_create(net, itn, p);
-
+			if (IS_ERR(t)) {
+				err = PTR_ERR(t);
+				break;
+			}
+		}
 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
@@ -777,8 +785,9 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 		if (t) {
 			err = 0;
 			ip_tunnel_update(itn, t, dev, p, true);
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		} else {
+			err = -ENOENT;
+		}
 		break;
 
 	case SIOCDELTUNNEL:
@@ -993,19 +1002,13 @@ int ip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	int i, err;
+	int err;
 
 	dev->destructor	= ip_tunnel_dev_free;
-	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	for_each_possible_cpu(i) {
-		struct pcpu_sw_netstats *ipt_stats;
-		ipt_stats = per_cpu_ptr(dev->tstats, i);
-		u64_stats_init(&ipt_stats->syncp);
-	}
-
 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
 	if (!tunnel->dst_cache) {
 		free_percpu(dev->tstats);
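netdev_alloc_pcpu_stats() folds the allocation plus the per-CPU u64_stats_init() loop that the hunk above deletes into one helper. Roughly equivalent open-coded form, reconstructed from the removed loop rather than copied from the header (so a sketch, not the macro's exact expansion):

static struct pcpu_sw_netstats __percpu *my_alloc_tstats(void)
{
	struct pcpu_sw_netstats __percpu *stats;
	int cpu;

	stats = alloc_percpu(struct pcpu_sw_netstats);
	if (!stats)
		return NULL;

	for_each_possible_cpu(cpu) {
		struct pcpu_sw_netstats *s = per_cpu_ptr(stats, cpu);

		u64_stats_init(&s->syncp);	/* give each syncp its lockdep key */
	}

	return stats;
}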
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 8d69626f2206..e0c2b1d2ea4e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -162,12 +162,12 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin_bh(&tstats->syncp);
+			start = u64_stats_fetch_begin_irq(&tstats->syncp);
 			rx_packets = tstats->rx_packets;
 			tx_packets = tstats->tx_packets;
 			rx_bytes = tstats->rx_bytes;
 			tx_bytes = tstats->tx_bytes;
-		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
+		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
 
 		tot->rx_packets += rx_packets;
 		tot->tx_packets += tx_packets;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 48eafae51769..687ddef4e574 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/icmpv6.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -49,8 +50,8 @@ static struct rtnl_link_ops vti_link_ops __read_mostly;
 static int vti_net_id __read_mostly;
 static int vti_tunnel_init(struct net_device *dev);
 
-/* We dont digest the packet therefore let the packet pass */
-static int vti_rcv(struct sk_buff *skb)
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+		     int encap_type)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -60,79 +61,120 @@ static int vti_rcv(struct sk_buff *skb)
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
-		struct pcpu_sw_netstats *tstats;
-		u32 oldmark = skb->mark;
-		int ret;
-
-
-		/* temporarily mark the skb with the tunnel o_key, to
-		 * only match policies with this mark.
-		 */
-		skb->mark = be32_to_cpu(tunnel->parms.o_key);
-		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
-		skb->mark = oldmark;
-		if (!ret)
-			return -1;
-
-		tstats = this_cpu_ptr(tunnel->dev->tstats);
-		u64_stats_update_begin(&tstats->syncp);
-		tstats->rx_packets++;
-		tstats->rx_bytes += skb->len;
-		u64_stats_update_end(&tstats->syncp);
-
-		secpath_reset(skb);
-		skb->dev = tunnel->dev;
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+		skb->mark = be32_to_cpu(tunnel->parms.i_key);
+
+		return xfrm_input(skb, nexthdr, spi, encap_type);
+	}
+
+	return -EINVAL;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+static int vti_rcv(struct sk_buff *skb)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+	unsigned short family;
+	struct net_device *dev;
+	struct pcpu_sw_netstats *tstats;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+
+	if (!tunnel)
 		return 1;
+
+	dev = tunnel->dev;
+
+	if (err) {
+		dev->stats.rx_errors++;
+		dev->stats.rx_dropped++;
+
+		return 0;
 	}
 
-	return -1;
+	x = xfrm_input_state(skb);
+	family = x->inner_mode->afinfo->family;
+
+	if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+		return -EPERM;
+
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+	skb->dev = dev;
+
+	tstats = this_cpu_ptr(dev->tstats);
+
+	u64_stats_update_begin(&tstats->syncp);
+	tstats->rx_packets++;
+	tstats->rx_bytes += skb->len;
+	u64_stats_update_end(&tstats->syncp);
+
+	return 0;
 }
 
-/* This function assumes it is being called from dev_queue_xmit()
- * and that skb is filled properly by that function.
- */
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+	xfrm_address_t *saddr = (xfrm_address_t *)&src;
 
-static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+	/* if there is no transform then this tunnel is not functional.
+	 * Or if the xfrm is not mode tunnel.
+	 */
+	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+	    x->props.family != AF_INET)
+		return false;
+
+	if (!dst)
+		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+		return false;
+
+	return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+			    struct flowi *fl)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct iphdr  *tiph = &tunnel->parms.iph;
-	u8     tos;
-	struct rtable *rt;		/* Route to the other host */
+	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *tdev;	/* Device to other host */
-	struct iphdr  *old_iph = ip_hdr(skb);
-	__be32 dst = tiph->daddr;
-	struct flowi4 fl4;
 	int err;
 
-	if (skb->protocol != htons(ETH_P_IP))
-		goto tx_error;
-
-	tos = old_iph->tos;
-
-	memset(&fl4, 0, sizeof(fl4));
-	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
-			   RT_SCOPE_UNIVERSE,
-			   IPPROTO_IPIP, 0,
-			   dst, tiph->saddr, 0, 0);
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
+	if (!dst) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+
+	dst_hold(dst);
+	dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+	if (IS_ERR(dst)) {
 		dev->stats.tx_carrier_errors++;
 		goto tx_error_icmp;
 	}
-	/* if there is no transform then this tunnel is not functional.
-	 * Or if the xfrm is not mode tunnel.
-	 */
-	if (!rt->dst.xfrm ||
-	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
+
+	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
 		dev->stats.tx_carrier_errors++;
-		ip_rt_put(rt);
+		dst_release(dst);
 		goto tx_error_icmp;
 	}
-	tdev = rt->dst.dev;
+
+	tdev = dst->dev;
 
 	if (tdev == dev) {
-		ip_rt_put(rt);
+		dst_release(dst);
 		dev->stats.collisions++;
 		goto tx_error;
 	}
@@ -146,10 +188,8 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		tunnel->err_count = 0;
 	}
 
-	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->dst);
-	nf_reset(skb);
+	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+	skb_dst_set(skb, dst);
 	skb->dev = skb_dst(skb)->dev;
 
 	err = dst_output(skb);
@@ -166,6 +206,95 @@ tx_error:
 	return NETDEV_TX_OK;
 }
 
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+
+	skb->mark = be32_to_cpu(tunnel->parms.o_key);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		xfrm_decode_session(skb, &fl, AF_INET);
+		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+		break;
+	case htons(ETH_P_IPV6):
+		xfrm_decode_session(skb, &fl, AF_INET6);
+		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+		break;
+	default:
+		dev->stats.tx_errors++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+	__be32 spi;
+	struct xfrm_state *x;
+	struct ip_tunnel *tunnel;
+	struct ip_esp_hdr *esph;
+	struct ip_auth_hdr *ah ;
+	struct ip_comp_hdr *ipch;
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	int protocol = iph->protocol;
+	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+				  iph->daddr, iph->saddr, 0);
+	if (!tunnel)
+		return -1;
+
+	switch (protocol) {
+	case IPPROTO_ESP:
+		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = esph->spi;
+		break;
+	case IPPROTO_AH:
+		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+		spi = ah->spi;
+		break;
+	case IPPROTO_COMP:
+		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+		spi = htonl(ntohs(ipch->cpi));
+		break;
+	default:
+		return 0;
+	}
+
+	switch (icmp_hdr(skb)->type) {
+	case ICMP_DEST_UNREACH:
+		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+			return 0;
+	case ICMP_REDIRECT:
+		break;
+	default:
+		return 0;
+	}
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, protocol, AF_INET);
+	if (!x)
+		return 0;
+
+	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+	else
+		ipv4_redirect(skb, net, 0, 0, protocol, 0);
+	xfrm_state_put(x);
+
+	return 0;
+}
+
 static int
 vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
@@ -181,12 +310,13 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		return -EINVAL;
 	}
 
+	p.i_flags |= VTI_ISVTI;
 	err = ip_tunnel_ioctl(dev, &p, cmd);
 	if (err)
 		return err;
 
 	if (cmd != SIOCDELTUNNEL) {
-		p.i_flags |= GRE_KEY | VTI_ISVTI;
+		p.i_flags |= GRE_KEY;
 		p.o_flags |= GRE_KEY;
 	}
 
@@ -224,7 +354,6 @@ static int vti_tunnel_init(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
@@ -241,9 +370,28 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 }
 
-static struct xfrm_tunnel_notifier vti_handler __read_mostly = {
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
 	.handler	=	vti_rcv,
-	.priority	=	1,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+	.handler	=	vti_rcv,
+	.input_handler	=	vti_input,
+	.cb_handler	=	vti_rcv_cb,
+	.err_handler	=	vti4_err,
+	.priority	=	100,
 };
 
 static int __net_init vti_init_net(struct net *net)
@@ -287,6 +435,8 @@ static void vti_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
+	parms->i_flags = VTI_ISVTI;
+
 	if (data[IFLA_VTI_LINK])
 		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
 
@@ -382,10 +532,31 @@ static int __init vti_init(void)
 	err = register_pernet_device(&vti_net_ops);
 	if (err < 0)
 		return err;
-	err = xfrm4_mode_tunnel_input_register(&vti_handler);
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0) {
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+		unregister_pernet_device(&vti_net_ops);
+		pr_info("vti init: can't register tunnel\n");
+
+		return err;
+	}
+
+	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
 	if (err < 0) {
+		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 		unregister_pernet_device(&vti_net_ops);
 		pr_info("vti init: can't register tunnel\n");
+
+		return err;
 	}
 
 	err = rtnl_link_register(&vti_link_ops);
@@ -395,7 +566,9 @@ static int __init vti_init(void)
 	return err;
 
 rtnl_link_failed:
-	xfrm4_mode_tunnel_input_deregister(&vti_handler);
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti_net_ops);
 	return err;
 }
@@ -403,8 +576,13 @@ rtnl_link_failed:
 static void __exit vti_fini(void)
 {
 	rtnl_link_unregister(&vti_link_ops);
-	if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
+	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
 		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
+		pr_info("vti close: can't deregister tunnel\n");
+	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
+		pr_info("vti close: can't deregister tunnel\n");
+
 
 	unregister_pernet_device(&vti_net_ops);
 }
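The rewritten vti receive path splits responsibilities: vti_rcv()/vti_input() claim the packet for a matching tunnel, stash the tunnel in XFRM_TUNNEL_SKB_CB, set skb->mark from i_key (the old code marked inbound packets with o_key) and hand off to xfrm_input(); vti_rcv_cb() then runs after the transform to do the policy check, netns scrubbing and statistics. Registering at priority 100 puts the vti handlers ahead of the plain ESP/AH/IPComp ones at priority 0, and vti_input() returning -EINVAL when no tunnel matches is what lets the dispatcher fall through. In pseudocode (the real loop lives in the new xfrm4_protocol.c, which this page does not show; for_each_handler is an invented name):

/* pseudocode sketch, not the actual xfrm4_protocol.c implementation */
static int xfrm4_esp_rcv_sketch(struct sk_buff *skb)
{
	struct xfrm4_protocol *handler;
	int ret;

	for_each_handler(esp4_handlers, handler) {	/* sorted: vti (100) before esp4 (0) */
		ret = handler->handler(skb);		/* vti_rcv() or xfrm4_rcv() */
		if (ret != -EINVAL)
			return ret;			/* this handler claimed the packet */
	}

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
	kfree_skb(skb);
	return 0;
}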
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 826be4cb482a..c0855d50a3fa 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -23,7 +23,7 @@
 #include <net/protocol.h>
 #include <net/sock.h>
 
-static void ipcomp4_err(struct sk_buff *skb, u32 info)
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
 	__be32 spi;
@@ -34,24 +34,26 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	switch (icmp_hdr(skb)->type) {
 	case ICMP_DEST_UNREACH:
 		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
-			return;
+			return 0;
 	case ICMP_REDIRECT:
 		break;
 	default:
-		return;
+		return 0;
 	}
 
 	spi = htonl(ntohs(ipch->cpi));
 	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
 			      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
-		return;
+		return 0;
 
 	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
 		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
 	else
 		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
 	xfrm_state_put(x);
+
+	return 0;
 }
 
 /* We always hold one tunnel user reference to indicate a tunnel */
@@ -147,6 +149,11 @@ out:
 	return err;
 }
 
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+	return 0;
+}
+
 static const struct xfrm_type ipcomp_type = {
 	.description	= "IPCOMP4",
 	.owner		= THIS_MODULE,
@@ -157,11 +164,12 @@ static const struct xfrm_type ipcomp_type = {
 	.output		= ipcomp_output
 };
 
-static const struct net_protocol ipcomp4_protocol = {
+static struct xfrm4_protocol ipcomp4_protocol = {
 	.handler	=	xfrm4_rcv,
+	.input_handler	=	xfrm_input,
+	.cb_handler	=	ipcomp4_rcv_cb,
 	.err_handler	=	ipcomp4_err,
-	.no_policy	=	1,
-	.netns_ok	=	1,
+	.priority	=	0,
 };
 
 static int __init ipcomp4_init(void)
@@ -170,7 +178,7 @@ static int __init ipcomp4_init(void)
 		pr_info("%s: can't add xfrm type\n", __func__);
 		return -EAGAIN;
 	}
-	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
 		xfrm_unregister_type(&ipcomp_type, AF_INET);
 		return -EAGAIN;
@@ -180,7 +188,7 @@ static int __init ipcomp4_init(void)
 
 static void __exit ipcomp4_fini(void)
 {
-	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
 		pr_info("%s: can't remove protocol\n", __func__);
 	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
 		pr_info("%s: can't remove xfrm type\n", __func__);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3e0adea9c27..7ebd6e37875c 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -61,7 +61,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
 		skb_dst_set(skb, NULL);
 		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
 		if (IS_ERR(dst))
-			return PTR_ERR(dst);;
+			return PTR_ERR(dst);
 		skb_dst_set(skb, dst);
 	}
 #endif
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cde0724..f95b6f93814b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd30cbe..99e810f84671 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2d11c094296e..f4b19e5dde54 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -727,7 +727,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 	sock_tx_timestamp(sk, &ipc.tx_flags);
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			return err;
 		if (ipc.opt)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a6c8a80ec9d6..ad737fad6d8b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
 	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
@@ -280,6 +281,11 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
 	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c04518f4850a..a9dbe58bdfe7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -524,7 +524,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
 		if (err)
 			goto out;
 		if (ipc.opt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4c011ec69ed4..34d094cadb11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,11 +139,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb);
 static void		ipv4_dst_destroy(struct dst_entry *dst);
 
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
-			    int how)
-{
-}
-
 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 {
 	WARN_ON(1);
@@ -162,7 +157,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.mtu =			ipv4_mtu,
 	.cow_metrics =		ipv4_cow_metrics,
 	.destroy =		ipv4_dst_destroy,
-	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
@@ -194,7 +188,7 @@ const __u8 ip_tos2prio[16] = {
 EXPORT_SYMBOL(ip_tos2prio);
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 
 #ifdef CONFIG_PROC_FS
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -697,7 +691,6 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 
 out_unlock:
 	spin_unlock_bh(&fnhe_lock);
-	return;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
@@ -2475,11 +2468,6 @@ errout_free:
 	goto errout;
 }
 
-int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	return skb->len;
-}
-
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
 	rt_cache_flush(dev_net(in_dev->dev));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 97c8f5620c43..4bd6d52eeffb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
-	tp->mdev = TCP_TIMEOUT_INIT;
+	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 
 	/* So many TCP implementations out there (incorrectly) count the
 	 * initial SYN frame in their delayed-ACK and congestion control
@@ -2341,7 +2341,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	sk->sk_shutdown = 0;
 	sock_reset_flag(sk, SOCK_DONE);
-	tp->srtt = 0;
+	tp->srtt_us = 0;
 	if ((tp->write_seq += tp->max_window + 2) == 0)
 		tp->write_seq = 1;
 	icsk->icsk_backoff = 0;
@@ -2785,8 +2785,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
-	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
-	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+	info->tcpi_rtt = tp->srtt_us >> 3;
+	info->tcpi_rttvar = tp->mdev_us >> 2;
 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	info->tcpi_snd_cwnd = tp->snd_cwnd;
 	info->tcpi_advmss = tp->advmss;
@@ -2796,6 +2796,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_space = tp->rcvq_space.space;
 
 	info->tcpi_total_retrans = tp->total_retrans;
+
+	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+					sk->sk_pacing_rate : ~0ULL;
+	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+					sk->sk_max_pacing_rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
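The tcp.c hunks belong to the series-wide move from jiffies to microseconds for RTT state: tp->srtt becomes tp->srtt_us and tp->mdev becomes tp->mdev_us, while keeping the long-standing Van Jacobson fixed-point scaling — the smoothed RTT is stored left-shifted by 3 (8x) and the mean deviation by 2 (4x), which is why tcp_get_info() can now report microseconds with a bare shift instead of jiffies_to_usecs(). A sketch of the arithmetic:

/* tp->srtt_us holds 8 * srtt in usec; tp->mdev_us holds 4 * mdev in usec. */
static inline u32 tcp_info_rtt_us(u32 srtt_us)
{
	return srtt_us >> 3;		/* matches info->tcpi_rtt above */
}

static inline u32 tcp_info_rttvar_us(u32 mdev_us)
{
	return mdev_us >> 2;		/* matches info->tcpi_rttvar above */
}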
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2388275adb9b..2b9464c93b88 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -361,21 +361,12 @@ u32 tcp_reno_ssthresh(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
-/* Lower bound on congestion window with halving. */
-u32 tcp_reno_min_cwnd(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	return tp->snd_ssthresh/2;
-}
-EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
-
 struct tcp_congestion_ops tcp_reno = {
 	.flags		= TCP_CONG_NON_RESTRICTED,
 	.name		= "reno",
 	.owner		= THIS_MODULE,
 	.ssthresh	= tcp_reno_ssthresh,
 	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
 };
 
 /* Initial congestion control used (until SYN)
@@ -387,6 +378,5 @@ struct tcp_congestion_ops tcp_init_congestion_ops = {
387 .owner = THIS_MODULE, 378 .owner = THIS_MODULE,
388 .ssthresh = tcp_reno_ssthresh, 379 .ssthresh = tcp_reno_ssthresh,
389 .cong_avoid = tcp_reno_cong_avoid, 380 .cong_avoid = tcp_reno_cong_avoid,
390 .min_cwnd = tcp_reno_min_cwnd,
391}; 381};
392EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); 382EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
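With PRR handling cwnd reduction in the core, the per-algorithm min_cwnd hook is no longer consulted, so tcp_reno_min_cwnd (ssthresh/2) is deleted and every ops table in the rest of this series loses its .min_cwnd line. A rough userspace mirror of the slimmed-down ops shape, with stand-in types and simplified hooks (not the kernel's tcp_congestion_ops):

    #include <stdio.h>

    /* After this series the core requires only ssthresh() and
     * cong_avoid(); there is no min_cwnd slot to fill. */
    struct cong_ops {
        unsigned int (*ssthresh)(unsigned int cwnd);
        unsigned int (*cong_avoid)(unsigned int cwnd, unsigned int acked);
        const char *name;
    };

    static unsigned int reno_ssthresh(unsigned int cwnd)
    {
        return cwnd / 2 > 2 ? cwnd / 2 : 2;   /* max(cwnd >> 1, 2) */
    }

    static unsigned int reno_cong_avoid(unsigned int cwnd, unsigned int acked)
    {
        return cwnd + acked;                  /* slow-start step, simplified */
    }

    static const struct cong_ops reno = {
        .ssthresh   = reno_ssthresh,
        .cong_avoid = reno_cong_avoid,
        .name       = "reno",
    };

    int main(void)
    {
        printf("%s: ssthresh(10)=%u\n", reno.name, reno.ssthresh(10));
        return 0;
    }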
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 828e4c3ffbaf..8bf224516ba2 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
476 /* divide by bic_scale and by constant Srtt (100ms) */ 476 /* divide by bic_scale and by constant Srtt (100ms) */
477 do_div(cube_factor, bic_scale * 10); 477 do_div(cube_factor, bic_scale * 10);
478 478
479 /* hystart needs ms clock resolution */
480 if (hystart && HZ < 1000)
481 cubictcp.flags |= TCP_CONG_RTT_STAMP;
482
483 return tcp_register_congestion_control(&cubictcp); 479 return tcp_register_congestion_control(&cubictcp);
484} 480}
485 481
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8ed9305dfdf4..8b9e7bad77c0 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -162,7 +162,6 @@ static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
165 .min_cwnd = tcp_reno_min_cwnd,
166 165
167 .owner = THIS_MODULE, 166 .owner = THIS_MODULE,
168 .name = "highspeed" 167 .name = "highspeed"
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 478fe82611bf..a15a799bf768 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -21,7 +21,7 @@ struct hybla {
21 u32 rho2; /* Rho * Rho, integer part */ 21 u32 rho2; /* Rho * Rho, integer part */
22 u32 rho_3ls; /* Rho parameter, <<3 */ 22 u32 rho_3ls; /* Rho parameter, <<3 */
23 u32 rho2_7ls; /* Rho^2, <<7 */ 23 u32 rho2_7ls; /* Rho^2, <<7 */
24 u32 minrtt; /* Minimum smoothed round trip time value seen */ 24 u32 minrtt_us; /* Minimum smoothed round trip time value seen */
25}; 25};
26 26
27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ 27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
35{ 35{
36 struct hybla *ca = inet_csk_ca(sk); 36 struct hybla *ca = inet_csk_ca(sk);
37 37
38 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); 38 ca->rho_3ls = max_t(u32,
39 tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
40 8U);
39 ca->rho = ca->rho_3ls >> 3; 41 ca->rho = ca->rho_3ls >> 3;
40 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
41 ca->rho2 = ca->rho2_7ls >> 7; 43 ca->rho2 = ca->rho2_7ls >> 7;
@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
59 hybla_recalc_param(sk); 61 hybla_recalc_param(sk);
60 62
61 /* set minimum rtt as this is the 1st ever seen */ 63 /* set minimum rtt as this is the 1st ever seen */
62 ca->minrtt = tp->srtt; 64 ca->minrtt_us = tp->srtt_us;
63 tp->snd_cwnd = ca->rho; 65 tp->snd_cwnd = ca->rho;
64} 66}
65 67
@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
94 int is_slowstart = 0; 96 int is_slowstart = 0;
95 97
96 /* Recalculate rho only if this srtt is the lowest */ 98 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){ 99 if (tp->srtt_us < ca->minrtt_us) {
98 hybla_recalc_param(sk); 100 hybla_recalc_param(sk);
99 ca->minrtt = tp->srtt; 101 ca->minrtt_us = tp->srtt_us;
100 } 102 }
101 103
102 if (!tcp_is_cwnd_limited(sk, in_flight)) 104 if (!tcp_is_cwnd_limited(sk, in_flight))
@@ -166,7 +168,6 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
166static struct tcp_congestion_ops tcp_hybla __read_mostly = { 168static struct tcp_congestion_ops tcp_hybla __read_mostly = {
167 .init = hybla_init, 169 .init = hybla_init,
168 .ssthresh = tcp_reno_ssthresh, 170 .ssthresh = tcp_reno_ssthresh,
169 .min_cwnd = tcp_reno_min_cwnd,
170 .cong_avoid = hybla_cong_avoid, 171 .cong_avoid = hybla_cong_avoid,
171 .set_state = hybla_state, 172 .set_state = hybla_state,
172 173
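The Hybla conversion above keeps the <<3 fixed point: srtt_us carries the smoothed RTT <<3 in microseconds, and rtt0 is in milliseconds, so the divisor becomes rtt0 * USEC_PER_MSEC; the floor of 8 keeps rho >= 1. A small self-contained check of that arithmetic (userspace stand-ins, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define USEC_PER_MSEC 1000U
    static const unsigned int rtt0 = 25;  /* reference RTT in ms (module default) */

    /* rho in <<3 fixed point: srtt_us is already <<3, so dividing by
     * rtt0 in usec preserves the scaling; floor 8 means rho >= 1. */
    static uint32_t hybla_rho_3ls(uint32_t srtt_us)
    {
        uint32_t r = srtt_us / (rtt0 * USEC_PER_MSEC);
        return r > 8 ? r : 8;
    }

    int main(void)
    {
        /* srtt = 100 ms -> srtt_us field = 100000 << 3 -> rho = 100/25 = 4 */
        uint32_t r3 = hybla_rho_3ls(100000 << 3);
        printf("rho_3ls=%u rho=%u\n", r3, r3 >> 3);
        return 0;
    }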
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index e498a62b8f97..863d105e3015 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -325,10 +325,8 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
325} 325}
326 326
327static struct tcp_congestion_ops tcp_illinois __read_mostly = { 327static struct tcp_congestion_ops tcp_illinois __read_mostly = {
328 .flags = TCP_CONG_RTT_STAMP,
329 .init = tcp_illinois_init, 328 .init = tcp_illinois_init,
330 .ssthresh = tcp_illinois_ssthresh, 329 .ssthresh = tcp_illinois_ssthresh,
331 .min_cwnd = tcp_reno_min_cwnd,
332 .cong_avoid = tcp_illinois_cong_avoid, 330 .cong_avoid = tcp_illinois_cong_avoid,
333 .set_state = tcp_illinois_state, 331 .set_state = tcp_illinois_state,
334 .get_info = tcp_illinois_info, 332 .get_info = tcp_illinois_info,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eeaac399420d..e1661f46fd19 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -667,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
667 * To save cycles in the RFC 1323 implementation it was better to break 667 * To save cycles in the RFC 1323 implementation it was better to break
668 * it up into three procedures. -- erics 668 * it up into three procedures. -- erics
669 */ 669 */
670static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) 670static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
671{ 671{
672 struct tcp_sock *tp = tcp_sk(sk); 672 struct tcp_sock *tp = tcp_sk(sk);
673 long m = mrtt; /* RTT */ 673 long m = mrtt_us; /* RTT */
674 u32 srtt = tp->srtt; 674 u32 srtt = tp->srtt_us;
675 675
676 /* The following amusing code comes from Jacobson's 676 /* The following amusing code comes from Jacobson's
677 * article in SIGCOMM '88. Note that rtt and mdev 677 * article in SIGCOMM '88. Note that rtt and mdev
@@ -694,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
694 srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 694 srtt += m; /* rtt = 7/8 rtt + 1/8 new */
695 if (m < 0) { 695 if (m < 0) {
696 m = -m; /* m is now abs(error) */ 696 m = -m; /* m is now abs(error) */
697 m -= (tp->mdev >> 2); /* similar update on mdev */ 697 m -= (tp->mdev_us >> 2); /* similar update on mdev */
698 /* This is similar to one of Eifel findings. 698 /* This is similar to one of Eifel findings.
699 * Eifel blocks mdev updates when rtt decreases. 699 * Eifel blocks mdev updates when rtt decreases.
700 * This solution is a bit different: we use finer gain 700 * This solution is a bit different: we use finer gain
@@ -706,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
706 if (m > 0) 706 if (m > 0)
707 m >>= 3; 707 m >>= 3;
708 } else { 708 } else {
709 m -= (tp->mdev >> 2); /* similar update on mdev */ 709 m -= (tp->mdev_us >> 2); /* similar update on mdev */
710 } 710 }
711 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 711 tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
712 if (tp->mdev > tp->mdev_max) { 712 if (tp->mdev_us > tp->mdev_max_us) {
713 tp->mdev_max = tp->mdev; 713 tp->mdev_max_us = tp->mdev_us;
714 if (tp->mdev_max > tp->rttvar) 714 if (tp->mdev_max_us > tp->rttvar_us)
715 tp->rttvar = tp->mdev_max; 715 tp->rttvar_us = tp->mdev_max_us;
716 } 716 }
717 if (after(tp->snd_una, tp->rtt_seq)) { 717 if (after(tp->snd_una, tp->rtt_seq)) {
718 if (tp->mdev_max < tp->rttvar) 718 if (tp->mdev_max_us < tp->rttvar_us)
719 tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; 719 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
720 tp->rtt_seq = tp->snd_nxt; 720 tp->rtt_seq = tp->snd_nxt;
721 tp->mdev_max = tcp_rto_min(sk); 721 tp->mdev_max_us = tcp_rto_min_us(sk);
722 } 722 }
723 } else { 723 } else {
724 /* no previous measure. */ 724 /* no previous measure. */
725 srtt = m << 3; /* take the measured time to be rtt */ 725 srtt = m << 3; /* take the measured time to be rtt */
726 tp->mdev = m << 1; /* make sure rto = 3*rtt */ 726 tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
727 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 727 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
728 tp->mdev_max_us = tp->rttvar_us;
728 tp->rtt_seq = tp->snd_nxt; 729 tp->rtt_seq = tp->snd_nxt;
729 } 730 }
730 tp->srtt = max(1U, srtt); 731 tp->srtt_us = max(1U, srtt);
731} 732}
732 733
733/* Set the sk_pacing_rate to allow proper sizing of TSO packets. 734/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -742,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
742 u64 rate; 743 u64 rate;
743 744
744 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ 745 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
745 rate = (u64)tp->mss_cache * 2 * (HZ << 3); 746 rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
746 747
747 rate *= max(tp->snd_cwnd, tp->packets_out); 748 rate *= max(tp->snd_cwnd, tp->packets_out);
748 749
749 /* Correction for small srtt and scheduling constraints. 750 if (likely(tp->srtt_us))
750 * For small rtt, consider noise is too high, and use 751 do_div(rate, tp->srtt_us);
751 * the minimal value (srtt = 1 -> 125 us for HZ=1000)
752 *
753 * We probably need usec resolution in the future.
754 * Note: This also takes care of possible srtt=0 case,
755 * when tcp_rtt_estimator() was not yet called.
756 */
757 if (tp->srtt > 8 + 2)
758 do_div(rate, tp->srtt);
759 752
760 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate 753 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
761 * without any lock. We want to make sure compiler wont store 754 * without any lock. We want to make sure compiler wont store
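With srtt in usec the pacing-rate correction for coarse jiffies resolution can go away: the rate is simply 200% of mss * cwnd / srtt, and since srtt_us carries the value <<3, the numerator uses USEC_PER_SEC << 3 to cancel the shift. An illustrative userspace version of the computation:

    #include <stdint.h>
    #include <stdio.h>

    #define USEC_PER_SEC 1000000ULL

    /* 200% of the current rate; srtt_us is the smoothed RTT << 3 in
     * usec, hence the << 3 in the numerator.  Sketch only. */
    static uint64_t pacing_rate(uint32_t mss, uint32_t cwnd,
                                uint32_t packets_out,
                                uint32_t srtt_us /* << 3 */)
    {
        uint64_t rate = (uint64_t)mss * 2 * (USEC_PER_SEC << 3);

        rate *= cwnd > packets_out ? cwnd : packets_out;
        if (srtt_us)        /* no sample yet: leave the rate unscaled */
            rate /= srtt_us;
        return rate;        /* bytes per second */
    }

    int main(void)
    {
        /* mss 1448, cwnd 10, srtt 100 ms: 2 * 10 * 1448 / 0.1 s = 289600 B/s */
        printf("%llu\n",
               (unsigned long long)pacing_rate(1448, 10, 5, 100000 << 3));
        return 0;
    }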
@@ -1122,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1122} 1115}
1123 1116
1124struct tcp_sacktag_state { 1117struct tcp_sacktag_state {
1125 int reord; 1118 int reord;
1126 int fack_count; 1119 int fack_count;
1127 int flag; 1120 long rtt_us; /* RTT measured by SACKing never-retransmitted data */
1128 s32 rtt; /* RTT measured by SACKing never-retransmitted data */ 1121 int flag;
1129}; 1122};
1130 1123
1131/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1124/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1186,7 +1179,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1186static u8 tcp_sacktag_one(struct sock *sk, 1179static u8 tcp_sacktag_one(struct sock *sk,
1187 struct tcp_sacktag_state *state, u8 sacked, 1180 struct tcp_sacktag_state *state, u8 sacked,
1188 u32 start_seq, u32 end_seq, 1181 u32 start_seq, u32 end_seq,
1189 int dup_sack, int pcount, u32 xmit_time) 1182 int dup_sack, int pcount,
1183 const struct skb_mstamp *xmit_time)
1190{ 1184{
1191 struct tcp_sock *tp = tcp_sk(sk); 1185 struct tcp_sock *tp = tcp_sk(sk);
1192 int fack_count = state->fack_count; 1186 int fack_count = state->fack_count;
@@ -1227,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
1227 if (!after(end_seq, tp->high_seq)) 1221 if (!after(end_seq, tp->high_seq))
1228 state->flag |= FLAG_ORIG_SACK_ACKED; 1222 state->flag |= FLAG_ORIG_SACK_ACKED;
1229 /* Pick the earliest sequence sacked for RTT */ 1223 /* Pick the earliest sequence sacked for RTT */
1230 if (state->rtt < 0) 1224 if (state->rtt_us < 0) {
1231 state->rtt = tcp_time_stamp - xmit_time; 1225 struct skb_mstamp now;
1226
1227 skb_mstamp_get(&now);
1228 state->rtt_us = skb_mstamp_us_delta(&now,
1229 xmit_time);
1230 }
1232 } 1231 }
1233 1232
1234 if (sacked & TCPCB_LOST) { 1233 if (sacked & TCPCB_LOST) {
@@ -1287,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1287 */ 1286 */
1288 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1287 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1289 start_seq, end_seq, dup_sack, pcount, 1288 start_seq, end_seq, dup_sack, pcount,
1290 TCP_SKB_CB(skb)->when); 1289 &skb->skb_mstamp);
1291 1290
1292 if (skb == tp->lost_skb_hint) 1291 if (skb == tp->lost_skb_hint)
1293 tp->lost_cnt_hint += pcount; 1292 tp->lost_cnt_hint += pcount;
@@ -1565,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1565 TCP_SKB_CB(skb)->end_seq, 1564 TCP_SKB_CB(skb)->end_seq,
1566 dup_sack, 1565 dup_sack,
1567 tcp_skb_pcount(skb), 1566 tcp_skb_pcount(skb),
1568 TCP_SKB_CB(skb)->when); 1567 &skb->skb_mstamp);
1569 1568
1570 if (!before(TCP_SKB_CB(skb)->seq, 1569 if (!before(TCP_SKB_CB(skb)->seq,
1571 tcp_highest_sack_seq(tp))) 1570 tcp_highest_sack_seq(tp)))
@@ -1622,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
1622 1621
1623static int 1622static int
1624tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1623tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1625 u32 prior_snd_una, s32 *sack_rtt) 1624 u32 prior_snd_una, long *sack_rtt_us)
1626{ 1625{
1627 struct tcp_sock *tp = tcp_sk(sk); 1626 struct tcp_sock *tp = tcp_sk(sk);
1628 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1627 const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1640,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1640 1639
1641 state.flag = 0; 1640 state.flag = 0;
1642 state.reord = tp->packets_out; 1641 state.reord = tp->packets_out;
1643 state.rtt = -1; 1642 state.rtt_us = -1L;
1644 1643
1645 if (!tp->sacked_out) { 1644 if (!tp->sacked_out) {
1646 if (WARN_ON(tp->fackets_out)) 1645 if (WARN_ON(tp->fackets_out))
@@ -1824,7 +1823,7 @@ out:
1824 WARN_ON((int)tp->retrans_out < 0); 1823 WARN_ON((int)tp->retrans_out < 0);
1825 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1824 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1826#endif 1825#endif
1827 *sack_rtt = state.rtt; 1826 *sack_rtt_us = state.rtt_us;
1828 return state.flag; 1827 return state.flag;
1829} 1828}
1830 1829
@@ -2035,10 +2034,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2035 * available, or RTO is scheduled to fire first. 2034 * available, or RTO is scheduled to fire first.
2036 */ 2035 */
2037 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || 2036 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2038 (flag & FLAG_ECE) || !tp->srtt) 2037 (flag & FLAG_ECE) || !tp->srtt_us)
2039 return false; 2038 return false;
2040 2039
2041 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); 2040 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2041 msecs_to_jiffies(2));
2042
2042 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 2043 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2043 return false; 2044 return false;
2044 2045
@@ -2885,7 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2885} 2886}
2886 2887
2887static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2888static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2888 s32 seq_rtt, s32 sack_rtt) 2889 long seq_rtt_us, long sack_rtt_us)
2889{ 2890{
2890 const struct tcp_sock *tp = tcp_sk(sk); 2891 const struct tcp_sock *tp = tcp_sk(sk);
2891 2892
@@ -2895,10 +2896,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2895 * is acked (RFC6298). 2896 * is acked (RFC6298).
2896 */ 2897 */
2897 if (flag & FLAG_RETRANS_DATA_ACKED) 2898 if (flag & FLAG_RETRANS_DATA_ACKED)
2898 seq_rtt = -1; 2899 seq_rtt_us = -1L;
2899 2900
2900 if (seq_rtt < 0) 2901 if (seq_rtt_us < 0)
2901 seq_rtt = sack_rtt; 2902 seq_rtt_us = sack_rtt_us;
2902 2903
2903 /* RTTM Rule: A TSecr value received in a segment is used to 2904 /* RTTM Rule: A TSecr value received in a segment is used to
2904 * update the averaged RTT measurement only if the segment 2905 * update the averaged RTT measurement only if the segment
@@ -2906,14 +2907,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2906 * left edge of the send window. 2907 * left edge of the send window.
2907 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2908 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2908 */ 2909 */
2909 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2910 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2910 flag & FLAG_ACKED) 2911 flag & FLAG_ACKED)
2911 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 2912 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2912 2913
2913 if (seq_rtt < 0) 2914 if (seq_rtt_us < 0)
2914 return false; 2915 return false;
2915 2916
2916 tcp_rtt_estimator(sk, seq_rtt); 2917 tcp_rtt_estimator(sk, seq_rtt_us);
2917 tcp_set_rto(sk); 2918 tcp_set_rto(sk);
2918 2919
2919 /* RFC6298: only reset backoff on valid RTT measurement. */ 2920 /* RFC6298: only reset backoff on valid RTT measurement. */
@@ -2925,16 +2926,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2925static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) 2926static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2926{ 2927{
2927 struct tcp_sock *tp = tcp_sk(sk); 2928 struct tcp_sock *tp = tcp_sk(sk);
2928 s32 seq_rtt = -1; 2929 long seq_rtt_us = -1L;
2929 2930
2930 if (synack_stamp && !tp->total_retrans) 2931 if (synack_stamp && !tp->total_retrans)
2931 seq_rtt = tcp_time_stamp - synack_stamp; 2932 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
2932 2933
2933 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets 2934 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
2934 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() 2935 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
2935 */ 2936 */
2936 if (!tp->srtt) 2937 if (!tp->srtt_us)
2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); 2938 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2938} 2939}
2939 2940
2940static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) 2941static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
@@ -3023,26 +3024,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3023 * arrived at the other end. 3024 * arrived at the other end.
3024 */ 3025 */
3025static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, 3026static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3026 u32 prior_snd_una, s32 sack_rtt) 3027 u32 prior_snd_una, long sack_rtt_us)
3027{ 3028{
3028 struct tcp_sock *tp = tcp_sk(sk);
3029 const struct inet_connection_sock *icsk = inet_csk(sk); 3029 const struct inet_connection_sock *icsk = inet_csk(sk);
3030 struct sk_buff *skb; 3030 struct skb_mstamp first_ackt, last_ackt, now;
3031 u32 now = tcp_time_stamp; 3031 struct tcp_sock *tp = tcp_sk(sk);
3032 u32 prior_sacked = tp->sacked_out;
3033 u32 reord = tp->packets_out;
3032 bool fully_acked = true; 3034 bool fully_acked = true;
3033 int flag = 0; 3035 long ca_seq_rtt_us = -1L;
3036 long seq_rtt_us = -1L;
3037 struct sk_buff *skb;
3034 u32 pkts_acked = 0; 3038 u32 pkts_acked = 0;
3035 u32 reord = tp->packets_out;
3036 u32 prior_sacked = tp->sacked_out;
3037 s32 seq_rtt = -1;
3038 s32 ca_seq_rtt = -1;
3039 ktime_t last_ackt = net_invalid_timestamp();
3040 bool rtt_update; 3039 bool rtt_update;
3040 int flag = 0;
3041
3042 first_ackt.v64 = 0;
3041 3043
3042 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3044 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3043 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3045 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3044 u32 acked_pcount;
3045 u8 sacked = scb->sacked; 3046 u8 sacked = scb->sacked;
3047 u32 acked_pcount;
3046 3048
3047 /* Determine how many packets and what bytes were acked, tso and else */ 3049 /* Determine how many packets and what bytes were acked, tso and else */
3048 if (after(scb->end_seq, tp->snd_una)) { 3050 if (after(scb->end_seq, tp->snd_una)) {
@@ -3064,11 +3066,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3064 tp->retrans_out -= acked_pcount; 3066 tp->retrans_out -= acked_pcount;
3065 flag |= FLAG_RETRANS_DATA_ACKED; 3067 flag |= FLAG_RETRANS_DATA_ACKED;
3066 } else { 3068 } else {
3067 ca_seq_rtt = now - scb->when; 3069 last_ackt = skb->skb_mstamp;
3068 last_ackt = skb->tstamp; 3070 WARN_ON_ONCE(last_ackt.v64 == 0);
3069 if (seq_rtt < 0) { 3071 if (!first_ackt.v64)
3070 seq_rtt = ca_seq_rtt; 3072 first_ackt = last_ackt;
3071 } 3073
3072 if (!(sacked & TCPCB_SACKED_ACKED)) 3074 if (!(sacked & TCPCB_SACKED_ACKED))
3073 reord = min(pkts_acked, reord); 3075 reord = min(pkts_acked, reord);
3074 if (!after(scb->end_seq, tp->high_seq)) 3076 if (!after(scb->end_seq, tp->high_seq))
@@ -3114,7 +3116,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3114 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3116 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3115 flag |= FLAG_SACK_RENEGING; 3117 flag |= FLAG_SACK_RENEGING;
3116 3118
3117 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); 3119 skb_mstamp_get(&now);
3120 if (first_ackt.v64) {
3121 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3122 ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3123 }
3124
3125 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
3118 3126
3119 if (flag & FLAG_ACKED) { 3127 if (flag & FLAG_ACKED) {
3120 const struct tcp_congestion_ops *ca_ops 3128 const struct tcp_congestion_ops *ca_ops
@@ -3142,25 +3150,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3142 3150
3143 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3151 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3144 3152
3145 if (ca_ops->pkts_acked) { 3153 if (ca_ops->pkts_acked)
3146 s32 rtt_us = -1; 3154 ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
3147
3148 /* Is the ACK triggering packet unambiguous? */
3149 if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
3150 /* High resolution needed and available? */
3151 if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
3152 !ktime_equal(last_ackt,
3153 net_invalid_timestamp()))
3154 rtt_us = ktime_us_delta(ktime_get_real(),
3155 last_ackt);
3156 else if (ca_seq_rtt >= 0)
3157 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3158 }
3159 3155
3160 ca_ops->pkts_acked(sk, pkts_acked, rtt_us); 3156 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3161 } 3157 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3162 } else if (skb && rtt_update && sack_rtt >= 0 &&
3163 sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
3164 /* Do not re-arm RTO if the sack RTT is measured from data sent 3158 /* Do not re-arm RTO if the sack RTT is measured from data sent
3165 * after when the head was last (re)transmitted. Otherwise the 3159 * after when the head was last (re)transmitted. Otherwise the
3166 * timeout may continue to extend in loss recovery. 3160 * timeout may continue to extend in loss recovery.
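In the rewritten tcp_clean_rtx_queue above, the loop records the transmit stamp of the first (oldest) and last (newest) newly-acked, never-retransmitted skb; after the loop one skb_mstamp_get supplies "now", from which seq_rtt_us (the conservative sample fed to the RTO estimator) and ca_seq_rtt_us (the freshest sample, handed to the congestion module's pkts_acked) are both derived. A compact sketch of that selection, with plain usec counters as stand-ins for skb_mstamp:

    #include <stdint.h>
    #include <stdio.h>

    struct acked_rtts {
        long seq_rtt_us;     /* now - first_ackt (oldest newly-acked skb) */
        long ca_seq_rtt_us;  /* now - last_ackt (newest newly-acked skb) */
    };

    static struct acked_rtts rtts_from_acks(const uint64_t *xmit_us, int n,
                                            uint64_t now_us)
    {
        struct acked_rtts r = { -1L, -1L };
        uint64_t first = 0, last = 0;   /* 0 plays first_ackt.v64's sentinel role */

        for (int i = 0; i < n; i++) {   /* walk newly acked skbs in order */
            last = xmit_us[i];
            if (!first)
                first = last;
        }
        if (first) {
            r.seq_rtt_us = (long)(now_us - first);
            r.ca_seq_rtt_us = (long)(now_us - last);
        }
        return r;
    }

    int main(void)
    {
        uint64_t sent[] = { 1000, 2000, 3000 };   /* usec send stamps */
        struct acked_rtts r = rtts_from_acks(sent, 3, 10000);

        printf("seq_rtt=%ld ca_seq_rtt=%ld\n", r.seq_rtt_us, r.ca_seq_rtt_us);
        return 0;
    }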
@@ -3370,12 +3364,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3370 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3364 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3371 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3365 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3372 bool is_dupack = false; 3366 bool is_dupack = false;
3373 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; 3367 u32 prior_in_flight;
3374 u32 prior_fackets; 3368 u32 prior_fackets;
3375 int prior_packets = tp->packets_out; 3369 int prior_packets = tp->packets_out;
3376 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3370 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3377 int acked = 0; /* Number of packets newly acked */ 3371 int acked = 0; /* Number of packets newly acked */
3378 s32 sack_rtt = -1; 3372 long sack_rtt_us = -1L;
3379 3373
3380 /* If the ack is older than previous acks 3374 /* If the ack is older than previous acks
3381 * then we can probably ignore it. 3375 * then we can probably ignore it.
@@ -3433,7 +3427,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3433 3427
3434 if (TCP_SKB_CB(skb)->sacked) 3428 if (TCP_SKB_CB(skb)->sacked)
3435 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3429 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3436 &sack_rtt); 3430 &sack_rtt_us);
3437 3431
3438 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3432 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3439 flag |= FLAG_ECE; 3433 flag |= FLAG_ECE;
@@ -3452,7 +3446,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3452 3446
3453 /* See if we can take anything off of the retransmit queue. */ 3447 /* See if we can take anything off of the retransmit queue. */
3454 acked = tp->packets_out; 3448 acked = tp->packets_out;
3455 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); 3449 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
3450 sack_rtt_us);
3456 acked -= tp->packets_out; 3451 acked -= tp->packets_out;
3457 3452
3458 /* Advance cwnd if state allows */ 3453 /* Advance cwnd if state allows */
@@ -3475,8 +3470,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3475 3470
3476 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3471 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3477 tcp_schedule_loss_probe(sk); 3472 tcp_schedule_loss_probe(sk);
3478 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) 3473 tcp_update_pacing_rate(sk);
3479 tcp_update_pacing_rate(sk);
3480 return 1; 3474 return 1;
3481 3475
3482no_queue: 3476no_queue:
@@ -3505,7 +3499,7 @@ old_ack:
3505 */ 3499 */
3506 if (TCP_SKB_CB(skb)->sacked) { 3500 if (TCP_SKB_CB(skb)->sacked) {
3507 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3501 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3508 &sack_rtt); 3502 &sack_rtt_us);
3509 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3503 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3510 is_dupack, flag); 3504 is_dupack, flag);
3511 } 3505 }
@@ -5401,9 +5395,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5401 break; 5395 break;
5402 } 5396 }
5403 tcp_rearm_rto(sk); 5397 tcp_rearm_rto(sk);
5398 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
5404 return true; 5399 return true;
5405 } 5400 }
5406 tp->syn_data_acked = tp->syn_data; 5401 tp->syn_data_acked = tp->syn_data;
5402 if (tp->syn_data_acked)
5403 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
5407 return false; 5404 return false;
5408} 5405}
5409 5406
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e4eac779f51..6379894ec210 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 break; 435 break;
436 436
437 icsk->icsk_backoff--; 437 icsk->icsk_backoff--;
438 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : 438 inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
439 TCP_TIMEOUT_INIT) << icsk->icsk_backoff; 439 TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
440 tcp_bound_rto(sk); 440 tcp_bound_rto(sk);
441 441
@@ -854,8 +854,10 @@ static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
854{ 854{
855 int res = tcp_v4_send_synack(sk, NULL, req, 0); 855 int res = tcp_v4_send_synack(sk, NULL, req, 0);
856 856
857 if (!res) 857 if (!res) {
858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 858 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
859 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
860 }
859 return res; 861 return res;
860} 862}
861 863
@@ -878,8 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk,
878 bool want_cookie = false; 880 bool want_cookie = false;
879 struct listen_sock *lopt; 881 struct listen_sock *lopt;
880 882
881
882
883#ifdef CONFIG_SYN_COOKIES 883#ifdef CONFIG_SYN_COOKIES
884 if (sysctl_tcp_syncookies) { 884 if (sysctl_tcp_syncookies) {
885 msg = "Sending cookies"; 885 msg = "Sending cookies";
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 991d62a2f9bb..c9aecae31327 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -315,11 +315,9 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
315} 315}
316 316
317static struct tcp_congestion_ops tcp_lp __read_mostly = { 317static struct tcp_congestion_ops tcp_lp __read_mostly = {
318 .flags = TCP_CONG_RTT_STAMP,
319 .init = tcp_lp_init, 318 .init = tcp_lp_init,
320 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
321 .cong_avoid = tcp_lp_cong_avoid, 320 .cong_avoid = tcp_lp_cong_avoid,
322 .min_cwnd = tcp_reno_min_cwnd,
323 .pkts_acked = tcp_lp_pkts_acked, 321 .pkts_acked = tcp_lp_pkts_acked,
324 322
325 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558ba..d4f015ad6c84 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
103} 103}
104 104
105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 105static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
106 const char *buffer) 106 char *buffer)
107{ 107{
108 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 108 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
109 unsigned long long val; 109 unsigned long long val;
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
219 219
220static int __init tcp_memcontrol_init(void) 220static int __init tcp_memcontrol_init(void)
221{ 221{
222 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); 222 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
223 return 0; 223 return 0;
224} 224}
225__initcall(tcp_memcontrol_init); 225__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d547075d8300..dcaf72f10216 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -33,6 +33,11 @@ struct tcp_fastopen_metrics {
33 struct tcp_fastopen_cookie cookie; 33 struct tcp_fastopen_cookie cookie;
34}; 34};
35 35
36/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
37 * Kernel only stores RTT and RTTVAR in usec resolution
38 */
39#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
40
36struct tcp_metrics_block { 41struct tcp_metrics_block {
37 struct tcp_metrics_block __rcu *tcpm_next; 42 struct tcp_metrics_block __rcu *tcpm_next;
38 struct inetpeer_addr tcpm_saddr; 43 struct inetpeer_addr tcpm_saddr;
@@ -41,7 +46,7 @@ struct tcp_metrics_block {
41 u32 tcpm_ts; 46 u32 tcpm_ts;
42 u32 tcpm_ts_stamp; 47 u32 tcpm_ts_stamp;
43 u32 tcpm_lock; 48 u32 tcpm_lock;
44 u32 tcpm_vals[TCP_METRIC_MAX + 1]; 49 u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
45 struct tcp_fastopen_metrics tcpm_fastopen; 50 struct tcp_fastopen_metrics tcpm_fastopen;
46 51
47 struct rcu_head rcu_head; 52 struct rcu_head rcu_head;
@@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
59 return tm->tcpm_vals[idx]; 64 return tm->tcpm_vals[idx];
60} 65}
61 66
62static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
63 enum tcp_metric_index idx)
64{
65 return msecs_to_jiffies(tm->tcpm_vals[idx]);
66}
67
68static void tcp_metric_set(struct tcp_metrics_block *tm, 67static void tcp_metric_set(struct tcp_metrics_block *tm,
69 enum tcp_metric_index idx, 68 enum tcp_metric_index idx,
70 u32 val) 69 u32 val)
@@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
72 tm->tcpm_vals[idx] = val; 71 tm->tcpm_vals[idx] = val;
73} 72}
74 73
75static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
76 enum tcp_metric_index idx,
77 u32 val)
78{
79 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
80}
81
82static bool addr_same(const struct inetpeer_addr *a, 74static bool addr_same(const struct inetpeer_addr *a,
83 const struct inetpeer_addr *b) 75 const struct inetpeer_addr *b)
84{ 76{
@@ -101,9 +93,11 @@ struct tcpm_hash_bucket {
101 93
102static DEFINE_SPINLOCK(tcp_metrics_lock); 94static DEFINE_SPINLOCK(tcp_metrics_lock);
103 95
104static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, 96static void tcpm_suck_dst(struct tcp_metrics_block *tm,
97 const struct dst_entry *dst,
105 bool fastopen_clear) 98 bool fastopen_clear)
106{ 99{
100 u32 msval;
107 u32 val; 101 u32 val;
108 102
109 tm->tcpm_stamp = jiffies; 103 tm->tcpm_stamp = jiffies;
@@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
121 val |= 1 << TCP_METRIC_REORDERING; 115 val |= 1 << TCP_METRIC_REORDERING;
122 tm->tcpm_lock = val; 116 tm->tcpm_lock = val;
123 117
124 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); 118 msval = dst_metric_raw(dst, RTAX_RTT);
125 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); 119 tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
120
121 msval = dst_metric_raw(dst, RTAX_RTTVAR);
122 tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
126 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); 123 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
127 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); 124 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
128 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); 125 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
@@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
384 dst_confirm(dst); 381 dst_confirm(dst);
385 382
386 rcu_read_lock(); 383 rcu_read_lock();
387 if (icsk->icsk_backoff || !tp->srtt) { 384 if (icsk->icsk_backoff || !tp->srtt_us) {
388 /* This session failed to estimate rtt. Why? 385 /* This session failed to estimate rtt. Why?
389 * Probably, no packets returned in time. Reset our 386 * Probably, no packets returned in time. Reset our
390 * results. 387 * results.
@@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
399 if (!tm) 396 if (!tm)
400 goto out_unlock; 397 goto out_unlock;
401 398
402 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); 399 rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
403 m = rtt - tp->srtt; 400 m = rtt - tp->srtt_us;
404 401
405 /* If newly calculated rtt larger than stored one, store new 402 /* If newly calculated rtt larger than stored one, store new
406 * one. Otherwise, use EWMA. Remember, rtt overestimation is 403 * one. Otherwise, use EWMA. Remember, rtt overestimation is
@@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
408 */ 405 */
409 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { 406 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
410 if (m <= 0) 407 if (m <= 0)
411 rtt = tp->srtt; 408 rtt = tp->srtt_us;
412 else 409 else
413 rtt -= (m >> 3); 410 rtt -= (m >> 3);
414 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); 411 tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
415 } 412 }
416 413
417 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { 414 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
@@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
422 419
423 /* Scale deviation to rttvar fixed point */ 420 /* Scale deviation to rttvar fixed point */
424 m >>= 1; 421 m >>= 1;
425 if (m < tp->mdev) 422 if (m < tp->mdev_us)
426 m = tp->mdev; 423 m = tp->mdev_us;
427 424
428 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); 425 var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
429 if (m >= var) 426 if (m >= var)
430 var = m; 427 var = m;
431 else 428 else
432 var -= (var - m) >> 2; 429 var -= (var - m) >> 2;
433 430
434 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); 431 tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
435 } 432 }
436 433
437 if (tcp_in_initial_slowstart(tp)) { 434 if (tcp_in_initial_slowstart(tp)) {
@@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
528 tp->reordering = val; 525 tp->reordering = val;
529 } 526 }
530 527
531 crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); 528 crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
532 rcu_read_unlock(); 529 rcu_read_unlock();
533reset: 530reset:
534 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal 531 /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
@@ -551,18 +548,20 @@ reset:
551 * to low value, and then abruptly stops to do it and starts to delay 548 * to low value, and then abruptly stops to do it and starts to delay
552 * ACKs, wait for troubles. 549 * ACKs, wait for troubles.
553 */ 550 */
554 if (crtt > tp->srtt) { 551 if (crtt > tp->srtt_us) {
555 /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ 552 /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
556 crtt >>= 3; 553 crtt /= 8 * USEC_PER_MSEC;
557 inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); 554 inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
558 } else if (tp->srtt == 0) { 555 } else if (tp->srtt_us == 0) {
559 /* RFC6298: 5.7 We've failed to get a valid RTT sample from 556 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
560 * 3WHS. This is most likely due to retransmission, 557 * 3WHS. This is most likely due to retransmission,
561 * including spurious one. Reset the RTO back to 3secs 558 * including spurious one. Reset the RTO back to 3secs
562 * from the more aggressive 1sec to avoid more spurious 559 * from the more aggressive 1sec to avoid more spurious
563 * retransmission. 560 * retransmission.
564 */ 561 */
565 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; 562 tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
563 tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
564
566 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 565 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
567 } 566 }
568 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 567 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
@@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
809 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); 808 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
810 if (!nest) 809 if (!nest)
811 goto nla_put_failure; 810 goto nla_put_failure;
812 for (i = 0; i < TCP_METRIC_MAX + 1; i++) { 811 for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
813 if (!tm->tcpm_vals[i]) 812 u32 val = tm->tcpm_vals[i];
813
814 if (!val)
814 continue; 815 continue;
815 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) 816 if (i == TCP_METRIC_RTT) {
817 if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
818 val) < 0)
819 goto nla_put_failure;
820 n++;
821 val = max(val / 1000, 1U);
822 }
823 if (i == TCP_METRIC_RTTVAR) {
824 if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
825 val) < 0)
826 goto nla_put_failure;
827 n++;
828 val = max(val / 1000, 1U);
829 }
830 if (nla_put_u32(msg, i + 1, val) < 0)
816 goto nla_put_failure; 831 goto nla_put_failure;
817 n++; 832 n++;
818 } 833 }
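The metrics cache now stores RTT and RTTVAR in usec; the netlink dump emits the new *_US attributes verbatim and keeps the legacy millisecond attributes alive by dividing by 1000, clamped to at least 1 so a live sub-millisecond RTT is not reported as 0 (which old userspace reads as "unset"). The conversion in isolation:

    #include <stdio.h>
    #include <stdint.h>

    /* Legacy ms attribute derived from the stored usec value:
     * divide by 1000, clamp to >= 1. */
    static uint32_t legacy_ms(uint32_t usec_val)
    {
        uint32_t ms = usec_val / 1000;
        return ms ? ms : 1;
    }

    int main(void)
    {
        printf("%u\n", legacy_ms(250000));  /* 250 ms */
        printf("%u\n", legacy_ms(400));     /* clamped to 1 ms */
        return 0;
    }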
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7a436c517e44..ca788ada5bd3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
398 398
399 tcp_init_wl(newtp, treq->rcv_isn); 399 tcp_init_wl(newtp, treq->rcv_isn);
400 400
401 newtp->srtt = 0; 401 newtp->srtt_us = 0;
402 newtp->mdev = TCP_TIMEOUT_INIT; 402 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
403 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 403 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
404 404
405 newtp->packets_out = 0; 405 newtp->packets_out = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 17a11e65e57f..699fb102e971 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -86,6 +86,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 86 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
87 tcp_rearm_rto(sk); 87 tcp_rearm_rto(sk);
88 } 88 }
89
90 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
91 tcp_skb_pcount(skb));
89} 92}
90 93
91/* SND.NXT, if window was not shrunk. 94/* SND.NXT, if window was not shrunk.
@@ -269,6 +272,7 @@ EXPORT_SYMBOL(tcp_select_initial_window);
269static u16 tcp_select_window(struct sock *sk) 272static u16 tcp_select_window(struct sock *sk)
270{ 273{
271 struct tcp_sock *tp = tcp_sk(sk); 274 struct tcp_sock *tp = tcp_sk(sk);
275 u32 old_win = tp->rcv_wnd;
272 u32 cur_win = tcp_receive_window(tp); 276 u32 cur_win = tcp_receive_window(tp);
273 u32 new_win = __tcp_select_window(sk); 277 u32 new_win = __tcp_select_window(sk);
274 278
@@ -281,6 +285,9 @@ static u16 tcp_select_window(struct sock *sk)
281 * 285 *
282 * Relax Will Robinson. 286 * Relax Will Robinson.
283 */ 287 */
288 if (new_win == 0)
289 NET_INC_STATS(sock_net(sk),
290 LINUX_MIB_TCPWANTZEROWINDOWADV);
284 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); 291 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
285 } 292 }
286 tp->rcv_wnd = new_win; 293 tp->rcv_wnd = new_win;
@@ -298,8 +305,14 @@ static u16 tcp_select_window(struct sock *sk)
298 new_win >>= tp->rx_opt.rcv_wscale; 305 new_win >>= tp->rx_opt.rcv_wscale;
299 306
300 /* If we advertise zero window, disable fast path. */ 307 /* If we advertise zero window, disable fast path. */
301 if (new_win == 0) 308 if (new_win == 0) {
302 tp->pred_flags = 0; 309 tp->pred_flags = 0;
310 if (old_win)
311 NET_INC_STATS(sock_net(sk),
312 LINUX_MIB_TCPTOZEROWINDOWADV);
313 } else if (old_win == 0) {
314 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
315 }
303 316
304 return new_win; 317 return new_win;
305} 318}
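The three counters added to tcp_select_window distinguish wanting to announce a zero window but being held back by SWS avoidance (WANT), actually shrinking an open window to zero (TO), and reopening from zero (FROM). A rough stand-alone model of that accounting, with simplified windows (this is my reading of the hunk, not kernel code):

    #include <stdio.h>

    enum { WANT_ZERO, TO_ZERO, FROM_ZERO, NSTATS };

    /* old_win: previously advertised window; wanted: what the window
     * computation asked for; sent: what actually goes on the wire. */
    static void count_zwin(unsigned old_win, unsigned wanted, unsigned sent,
                           unsigned long stats[NSTATS])
    {
        if (wanted == 0 && sent != 0)
            stats[WANT_ZERO]++;   /* blocked by SWS avoidance */
        if (sent == 0 && old_win != 0)
            stats[TO_ZERO]++;     /* shrank an open window to zero */
        else if (sent != 0 && old_win == 0)
            stats[FROM_ZERO]++;   /* reopened from zero */
    }

    int main(void)
    {
        unsigned long stats[NSTATS] = { 0 };

        count_zwin(4096, 0, 4096, stats);  /* wanted zero, kept 4096 */
        count_zwin(4096, 0, 0, stats);     /* really went to zero */
        count_zwin(0, 8192, 8192, stats);  /* reopened */
        printf("want=%lu to=%lu from=%lu\n",
               stats[WANT_ZERO], stats[TO_ZERO], stats[FROM_ZERO]);
        return 0;
    }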
@@ -867,11 +880,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
867 if (clone_it) { 880 if (clone_it) {
868 const struct sk_buff *fclone = skb + 1; 881 const struct sk_buff *fclone = skb + 1;
869 882
870 /* If congestion control is doing timestamping, we must 883 skb_mstamp_get(&skb->skb_mstamp);
871 * take such a timestamp before we potentially clone/copy.
872 */
873 if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
874 __net_timestamp(skb);
875 884
876 if (unlikely(skb->fclone == SKB_FCLONE_ORIG && 885 if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
877 fclone->fclone == SKB_FCLONE_CLONE)) 886 fclone->fclone == SKB_FCLONE_CLONE))
@@ -884,6 +893,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
884 skb = skb_clone(skb, gfp_mask); 893 skb = skb_clone(skb, gfp_mask);
885 if (unlikely(!skb)) 894 if (unlikely(!skb))
886 return -ENOBUFS; 895 return -ENOBUFS;
896 /* Our usage of tstamp should remain private */
897 skb->tstamp.tv64 = 0;
887 } 898 }
888 899
889 inet = inet_sk(sk); 900 inet = inet_sk(sk);
@@ -1426,7 +1437,7 @@ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1426 * With Minshall's modification: all sent small packets are ACKed. 1437 * With Minshall's modification: all sent small packets are ACKed.
1427 */ 1438 */
1428static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, 1439static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1429 unsigned int mss_now, int nonagle) 1440 int nonagle)
1430{ 1441{
1431 return partial && 1442 return partial &&
1432 ((nonagle & TCP_NAGLE_CORK) || 1443 ((nonagle & TCP_NAGLE_CORK) ||
@@ -1458,7 +1469,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
1458 * to include this last segment in this skb. 1469 * to include this last segment in this skb.
1459 * Otherwise, we'll split the skb at last MSS boundary 1470 * Otherwise, we'll split the skb at last MSS boundary
1460 */ 1471 */
1461 if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) 1472 if (tcp_nagle_check(partial != 0, tp, nonagle))
1462 return needed - partial; 1473 return needed - partial;
1463 1474
1464 return needed; 1475 return needed;
@@ -1521,7 +1532,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
1521 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1532 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1522 return true; 1533 return true;
1523 1534
1524 if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) 1535 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1525 return true; 1536 return true;
1526 1537
1527 return false; 1538 return false;
@@ -1975,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
1975 struct inet_connection_sock *icsk = inet_csk(sk); 1986 struct inet_connection_sock *icsk = inet_csk(sk);
1976 struct tcp_sock *tp = tcp_sk(sk); 1987 struct tcp_sock *tp = tcp_sk(sk);
1977 u32 timeout, tlp_time_stamp, rto_time_stamp; 1988 u32 timeout, tlp_time_stamp, rto_time_stamp;
1978 u32 rtt = tp->srtt >> 3; 1989 u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
1979 1990
1980 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) 1991 if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
1981 return false; 1992 return false;
@@ -1997,7 +2008,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
1997 /* Schedule a loss probe in 2*RTT for SACK capable connections 2008 /* Schedule a loss probe in 2*RTT for SACK capable connections
1998 * in Open state, that are either limited by cwnd or application. 2009 * in Open state, that are either limited by cwnd or application.
1999 */ 2010 */
2000 if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || 2011 if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
2001 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2012 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
2002 return false; 2013 return false;
2003 2014
@@ -2082,7 +2093,6 @@ rearm_timer:
2082 if (likely(!err)) 2093 if (likely(!err))
2083 NET_INC_STATS_BH(sock_net(sk), 2094 NET_INC_STATS_BH(sock_net(sk),
2084 LINUX_MIB_TCPLOSSPROBES); 2095 LINUX_MIB_TCPLOSSPROBES);
2085 return;
2086} 2096}
2087 2097
2088/* Push out any pending frames which were held back due to 2098/* Push out any pending frames which were held back due to
@@ -2180,7 +2190,8 @@ u32 __tcp_select_window(struct sock *sk)
2180 */ 2190 */
2181 int mss = icsk->icsk_ack.rcv_mss; 2191 int mss = icsk->icsk_ack.rcv_mss;
2182 int free_space = tcp_space(sk); 2192 int free_space = tcp_space(sk);
2183 int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); 2193 int allowed_space = tcp_full_space(sk);
2194 int full_space = min_t(int, tp->window_clamp, allowed_space);
2184 int window; 2195 int window;
2185 2196
2186 if (mss > full_space) 2197 if (mss > full_space)
@@ -2193,7 +2204,19 @@ u32 __tcp_select_window(struct sock *sk)
2193 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 2204 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2194 4U * tp->advmss); 2205 4U * tp->advmss);
2195 2206
2196 if (free_space < mss) 2207 /* free_space might become our new window, make sure we don't
2208 * increase it due to wscale.
2209 */
2210 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2211
2212 /* if free space is less than mss estimate, or is below 1/16th
2213 * of the maximum allowed, try to move to zero-window, else
2214 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
2215 * new incoming data is dropped due to memory limits.
2216 * With large window, mss test triggers way too late in order
2217 * to announce zero window in time before rmem limit kicks in.
2218 */
2219 if (free_space < (allowed_space >> 4) || free_space < mss)
2197 return 0; 2220 return 0;
2198 } 2221 }
2199 2222
@@ -2431,7 +2454,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2431 if (err == 0) { 2454 if (err == 0) {
2432 /* Update global TCP statistics. */ 2455 /* Update global TCP statistics. */
2433 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); 2456 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2434 2457 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2458 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
2435 tp->total_retrans++; 2459 tp->total_retrans++;
2436 2460
2437#if FASTRETRANS_DEBUG > 0 2461#if FASTRETRANS_DEBUG > 0
@@ -2717,7 +2741,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2717 int tcp_header_size; 2741 int tcp_header_size;
2718 int mss; 2742 int mss;
2719 2743
2720 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2744 skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
2721 if (unlikely(!skb)) { 2745 if (unlikely(!skb)) {
2722 dst_release(dst); 2746 dst_release(dst);
2723 return NULL; 2747 return NULL;
@@ -2787,7 +2811,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2787 th->window = htons(min(req->rcv_wnd, 65535U)); 2811 th->window = htons(min(req->rcv_wnd, 65535U));
2788 tcp_options_write((__be32 *)(th + 1), tp, &opts); 2812 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2789 th->doff = (tcp_header_size >> 2); 2813 th->doff = (tcp_header_size >> 2);
2790 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); 2814 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
2791 2815
2792#ifdef CONFIG_TCP_MD5SIG 2816#ifdef CONFIG_TCP_MD5SIG
2793 /* Okay, we have all we need - do the md5 hash if needed */ 2817 /* Okay, we have all we need - do the md5 hash if needed */
@@ -2959,9 +2983,15 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2959 tcp_connect_queue_skb(sk, data); 2983 tcp_connect_queue_skb(sk, data);
2960 fo->copied = data->len; 2984 fo->copied = data->len;
2961 2985
2986 /* syn_data is about to be sent, we need to take current time stamps
2987 * for the packets that are in write queue : SYN packet and DATA
2988 */
2989 skb_mstamp_get(&syn->skb_mstamp);
2990 data->skb_mstamp = syn->skb_mstamp;
2991
2962 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { 2992 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2963 tp->syn_data = (fo->copied > 0); 2993 tp->syn_data = (fo->copied > 0);
2964 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); 2994 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
2965 goto done; 2995 goto done;
2966 } 2996 }
2967 syn_data = NULL; 2997 syn_data = NULL;
@@ -3049,8 +3079,9 @@ void tcp_send_delayed_ack(struct sock *sk)
3049 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements 3079 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
3050 * directly. 3080 * directly.
3051 */ 3081 */
3052 if (tp->srtt) { 3082 if (tp->srtt_us) {
3053 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); 3083 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3084 TCP_DELACK_MIN);
3054 3085
3055 if (rtt < max_ato) 3086 if (rtt < max_ato)
3056 max_ato = rtt; 3087 max_ato = rtt;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 1f2d37613c9e..3b66610d4156 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
154 p->snd_wnd = tp->snd_wnd; 154 p->snd_wnd = tp->snd_wnd;
155 p->rcv_wnd = tp->rcv_wnd; 155 p->rcv_wnd = tp->rcv_wnd;
156 p->ssthresh = tcp_current_ssthresh(sk); 156 p->ssthresh = tcp_current_ssthresh(sk);
157 p->srtt = tp->srtt >> 3; 157 p->srtt = tp->srtt_us >> 3;
158 158
159 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); 159 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
160 } 160 }
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 19ea6c2951f3..0ac50836da4d 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -39,7 +39,6 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
39static struct tcp_congestion_ops tcp_scalable __read_mostly = { 39static struct tcp_congestion_ops tcp_scalable __read_mostly = {
40 .ssthresh = tcp_scalable_ssthresh, 40 .ssthresh = tcp_scalable_ssthresh,
41 .cong_avoid = tcp_scalable_cong_avoid, 41 .cong_avoid = tcp_scalable_cong_avoid,
42 .min_cwnd = tcp_reno_min_cwnd,
43 42
44 .owner = THIS_MODULE, 43 .owner = THIS_MODULE,
45 .name = "scalable", 44 .name = "scalable",
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 64f0354c84c7..286227abed10 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -165,6 +165,9 @@ static int tcp_write_timeout(struct sock *sk)
165 dst_negative_advice(sk); 165 dst_negative_advice(sk);
166 if (tp->syn_fastopen || tp->syn_data) 166 if (tp->syn_fastopen || tp->syn_data)
167 tcp_fastopen_cache_set(sk, 0, NULL, true); 167 tcp_fastopen_cache_set(sk, 0, NULL, true);
168 if (tp->syn_data)
169 NET_INC_STATS_BH(sock_net(sk),
170 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
168 } 171 }
169 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 172 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
170 syn_set = true; 173 syn_set = true;
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 06cae62bf208..48539fff6357 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -306,11 +306,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
306EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 306EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
307 307
308static struct tcp_congestion_ops tcp_vegas __read_mostly = { 308static struct tcp_congestion_ops tcp_vegas __read_mostly = {
309 .flags = TCP_CONG_RTT_STAMP,
310 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
311 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
312 .cong_avoid = tcp_vegas_cong_avoid, 311 .cong_avoid = tcp_vegas_cong_avoid,
313 .min_cwnd = tcp_reno_min_cwnd,
314 .pkts_acked = tcp_vegas_pkts_acked, 312 .pkts_acked = tcp_vegas_pkts_acked,
315 .set_state = tcp_vegas_state, 313 .set_state = tcp_vegas_state,
316 .cwnd_event = tcp_vegas_cwnd_event, 314 .cwnd_event = tcp_vegas_cwnd_event,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 326475a94865..1b8e28fcd7e1 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
203} 203}
204 204
205static struct tcp_congestion_ops tcp_veno __read_mostly = { 205static struct tcp_congestion_ops tcp_veno __read_mostly = {
206 .flags = TCP_CONG_RTT_STAMP,
207 .init = tcp_veno_init, 206 .init = tcp_veno_init,
208 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
209 .cong_avoid = tcp_veno_cong_avoid, 208 .cong_avoid = tcp_veno_cong_avoid,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 76a1e23259e1..b94a04ae2ed5 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -276,7 +276,6 @@ static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
279 .min_cwnd = tcp_westwood_bw_rttmin,
280 .cwnd_event = tcp_westwood_event, 279 .cwnd_event = tcp_westwood_event,
281 .get_info = tcp_westwood_info, 280 .get_info = tcp_westwood_info,
282 .pkts_acked = tcp_westwood_pkts_acked, 281 .pkts_acked = tcp_westwood_pkts_acked,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 1a8d271f994d..5ede0e727945 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -227,11 +227,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
227} 227}
228 228
229static struct tcp_congestion_ops tcp_yeah __read_mostly = { 229static struct tcp_congestion_ops tcp_yeah __read_mostly = {
230 .flags = TCP_CONG_RTT_STAMP,
231 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
232 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
233 .cong_avoid = tcp_yeah_cong_avoid, 232 .cong_avoid = tcp_yeah_cong_avoid,
234 .min_cwnd = tcp_reno_min_cwnd,
235 .set_state = tcp_vegas_state, 233 .set_state = tcp_vegas_state,
236 .cwnd_event = tcp_vegas_cwnd_event, 234 .cwnd_event = tcp_vegas_cwnd_event,
237 .get_info = tcp_vegas_get_info, 235 .get_info = tcp_vegas_get_info,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 77bd16fa9f34..4468e1adc094 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -931,7 +931,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
931 sock_tx_timestamp(sk, &ipc.tx_flags); 931 sock_tx_timestamp(sk, &ipc.tx_flags);
932 932
933 if (msg->msg_controllen) { 933 if (msg->msg_controllen) {
934 err = ip_cmsg_send(sock_net(sk), msg, &ipc); 934 err = ip_cmsg_send(sock_net(sk), msg, &ipc,
935 sk->sk_family == AF_INET6);
935 if (err) 936 if (err)
936 return err; 937 return err;
937 if (ipc.opt) 938 if (ipc.opt)
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 1f12c8b45864..aac6197b7a71 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -37,15 +37,6 @@ drop:
 	return NET_RX_DROP;
 }
 
-int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
-		    int encap_type)
-{
-	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
-	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
-	return xfrm_input(skb, nexthdr, spi, encap_type);
-}
-EXPORT_SYMBOL(xfrm4_rcv_encap);
-
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {
 	struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 31b18152528f..05f2b484954f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,65 +15,6 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-/* Informational hook. The decap is still done here. */
-static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly;
-static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
-
-int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler)
-{
-	struct xfrm_tunnel_notifier __rcu **pprev;
-	struct xfrm_tunnel_notifier *t;
-	int ret = -EEXIST;
-	int priority = handler->priority;
-
-	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
-
-	for (pprev = &rcv_notify_handlers;
-	     (t = rcu_dereference_protected(*pprev,
-	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
-	     pprev = &t->next) {
-		if (t->priority > priority)
-			break;
-		if (t->priority == priority)
-			goto err;
-
-	}
-
-	handler->next = *pprev;
-	rcu_assign_pointer(*pprev, handler);
-
-	ret = 0;
-
-err:
-	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
-
-int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler)
-{
-	struct xfrm_tunnel_notifier __rcu **pprev;
-	struct xfrm_tunnel_notifier *t;
-	int ret = -ENOENT;
-
-	mutex_lock(&xfrm4_mode_tunnel_input_mutex);
-	for (pprev = &rcv_notify_handlers;
-	     (t = rcu_dereference_protected(*pprev,
-	     lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
-	     pprev = &t->next) {
-		if (t == handler) {
-			*pprev = handler->next;
-			ret = 0;
-			break;
-		}
-	}
-	mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
-	synchronize_net();
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
-
 static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
 {
 	struct iphdr *inner_iph = ipip_hdr(skb);
@@ -127,14 +68,8 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-#define for_each_input_rcu(head, handler)	\
-	for (handler = rcu_dereference(head);	\
-	     handler != NULL;			\
-	     handler = rcu_dereference(handler->next))
-
 static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_tunnel_notifier *handler;
 	int err = -EINVAL;
 
 	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -143,9 +78,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto out;
 
-	for_each_input_rcu(rcv_notify_handlers, handler)
-		handler->handler(skb);
-
 	err = skb_unclone(skb, GFP_ATOMIC);
 	if (err)
 		goto out;
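
The notifier deleted here let a consumer peek at tunnel-mode packets from inside xfrm4_mode_tunnel_input(). Its replacement is the per-protocol callback chain in the new net/ipv4/xfrm4_protocol.c below: the peek becomes a cb_handler. A migration sketch under assumed names (the generic xfrm4_rcv/xfrm_input fallbacks mirror what the reworked ip_vti.c does elsewhere in this diff):

#include <linux/in.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

static int my_rcv_cb(struct sk_buff *skb, int err)
{
	/* Body of the old xfrm_tunnel_notifier ->handler(skb) goes here.
	 * Returning > 0 lets lower-priority cb_handlers see the skb too;
	 * <= 0 ends the walk in xfrm4_rcv_cb() (see the new file below).
	 */
	return 1;
}

static int my_err_cb(struct sk_buff *skb, u32 info)
{
	return 0;
}

static struct xfrm4_protocol my_esp4_proto __read_mostly = {
	.handler	= xfrm4_rcv,	/* generic IPsec receive */
	.input_handler	= xfrm_input,	/* generic decap entry */
	.cb_handler	= my_rcv_cb,	/* the former informational hook */
	.err_handler	= my_err_cb,
	.priority	= 100,		/* above the base IPsec handlers */
};

/* registration replaces xfrm4_mode_tunnel_input_register(): */
static int __init my_init(void)
{
	return xfrm4_protocol_register(&my_esp4_proto, IPPROTO_ESP);
}
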
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e1a63930a967..6156f68a1e90 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -325,6 +325,7 @@ void __init xfrm4_init(void)
 
 	xfrm4_state_init();
 	xfrm4_policy_init();
+	xfrm4_protocol_init();
 #ifdef CONFIG_SYSCTL
 	register_pernet_subsys(&xfrm4_net_ops);
 #endif
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 000000000000..7f7b243e8139
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,286 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+	switch (protocol) {
+	case IPPROTO_ESP:
+		return &esp4_handlers;
+	case IPPROTO_AH:
+		return &ah4_handlers;
+	case IPPROTO_COMP:
+		return &ipcomp4_handlers;
+	}
+
+	return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))
+
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(*proto_handlers(protocol), handler)
+		if ((ret = handler->cb_handler(skb, err)) <= 0)
+			return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+		    int encap_type)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+	for_each_protocol_rcu(*proto_handlers(nexthdr), handler)
+		if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(esp4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(esp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ah4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ah4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+	int ret;
+	struct xfrm4_protocol *handler;
+
+	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler)
+		if ((ret = handler->handler(skb)) != -EINVAL)
+			return ret;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm4_protocol *handler;
+
+	for_each_protocol_rcu(ipcomp4_handlers, handler)
+		if (!handler->err_handler(skb, info))
+			break;
+}
+
+static const struct net_protocol esp4_protocol = {
+	.handler	=	xfrm4_esp_rcv,
+	.err_handler	=	xfrm4_esp_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static const struct net_protocol ah4_protocol = {
+	.handler	=	xfrm4_ah_rcv,
+	.err_handler	=	xfrm4_ah_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+	.handler	=	xfrm4_ipcomp_rcv,
+	.err_handler	=	xfrm4_ipcomp_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+	.family		=	AF_INET,
+	.owner		=	THIS_MODULE,
+	.callback	=	xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+	switch (protocol) {
+	case IPPROTO_ESP:
+		return &esp4_protocol;
+	case IPPROTO_AH:
+		return &ah4_protocol;
+	case IPPROTO_COMP:
+		return &ipcomp4_protocol;
+	}
+
+	return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+			    unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	bool add_netproto = false;
+	int ret = -EEXIST;
+	int priority = handler->priority;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex)))
+		add_netproto = true;
+
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority < priority)
+			break;
+		if (t->priority == priority)
+			goto err;
+	}
+
+	handler->next = *pprev;
+	rcu_assign_pointer(*pprev, handler);
+
+	ret = 0;
+
+err:
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	if (add_netproto) {
+		if (inet_add_protocol(netproto(protocol), protocol)) {
+			pr_err("%s: can't add protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+			      unsigned char protocol)
+{
+	struct xfrm4_protocol __rcu **pprev;
+	struct xfrm4_protocol *t;
+	int ret = -ENOENT;
+
+	mutex_lock(&xfrm4_protocol_mutex);
+
+	for (pprev = proto_handlers(protocol);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
+			*pprev = handler->next;
+			ret = 0;
+			break;
+		}
+	}
+
+	if (!rcu_dereference_protected(*proto_handlers(protocol),
+				       lockdep_is_held(&xfrm4_protocol_mutex))) {
+		if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+			pr_err("%s: can't remove protocol\n", __func__);
+			ret = -EAGAIN;
+		}
+	}
+
+	mutex_unlock(&xfrm4_protocol_mutex);
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
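
Putting the new multiplexer to use: a sketch assuming the shape of the esp4.c change counted in the diffstat (the callback bodies are stubs, not the real esp4 code). The first registrant for a protocol is the one that triggers inet_add_protocol(); later registrants just chain in, ordered by descending .priority.

#include <linux/in.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

/* Stub callbacks; the real esp4 versions do the actual work. */
static int esp4_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;
}

static int esp4_err_cb(struct sk_buff *skb, u32 info)
{
	return 0;
}

static struct xfrm4_protocol esp4_proto = {
	.handler	= xfrm4_rcv,
	.input_handler	= xfrm_input,
	.cb_handler	= esp4_rcv_cb,
	.err_handler	= esp4_err_cb,
	.priority	= 0,		/* base handler, consulted last */
};

static int __init esp4_example_init(void)
{
	if (xfrm4_protocol_register(&esp4_proto, IPPROTO_ESP) < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		return -EAGAIN;
	}
	return 0;
}

A handler opts out of a packet by returning -EINVAL, which passes it to the next handler in the chain; if every handler declines, the multiplexer answers with an ICMP port-unreachable and frees the skb. That priority chaining is what lets the reworked ip_vti.c claim IPsec packets for vti devices ahead of the base handlers, replacing the tunnel-notifier hook removed from xfrm4_mode_tunnel.c above.
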